rm(list = ls(all = TRUE))
gc()
##           used (Mb) gc trigger (Mb) max used (Mb)
## Ncells  625156 33.4    1434672 76.7   702048 37.5
## Vcells 1171313  9.0    8388608 64.0  1927980 14.8
library(magrittr)
library(data.table)
library(knitr)
library(ggplot2)
library(ComplexUpset)


`%!in%` = Negate(`%in%`)
`%nin%` = Negate(`%in%`)

setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

1 Ath gmm

fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
ath.gmm = gmm[gmm$plant == 'ath', ]
setDT(ath.gmm)
ath.gmm[, plant := NULL]
ath.gmm[, source := NULL]
colnames(ath.gmm)[2:4] = paste('ath', colnames(ath.gmm)[2:4], sep = '_')

2 Ath SKM & annotation

note: some duplicated ids in PSS

fp = file.path('..', 'input', 'ath-annot', 'Phytozome', 'PhytozomeV12', 
               'early_release', 'Athaliana_447_Araport11', 'annotation')
# fn = 'Araport11_GFF3_genes_transposons.current_utf8_attributes_CB.tsv'
fn = 'Athaliana_447_Araport11.geneName.txt'
gn = data.table::fread(file.path(fp, fn), header = FALSE, fill = TRUE)
colnames(gn)[2] = 'athName'
gn$V1 = sub('\\..*', '', gn$V1)
gn = gn[!duplicated(gn), ]


fn = 'Athaliana_447_Araport11.synonym.txt'
sn = data.table::fread(file.path(fp, fn), header = FALSE, fill = TRUE)
sn[, merged_column := apply(.SD, 1, function(x) {
  # Remove NA and empty strings
  x = x[!is.na(x) & x != ""]
  paste(x, collapse = " | ")
}), .SDcols = 2:ncol(sn)]
# Optionally, remove the original columns V2 to V15
sn[, (2:(ncol(sn)-1)) := NULL]
colnames(sn)[2] = 'athSynonims'
sn$V1 = sub('\\..*', '', sn$V1)
sn = sn[!duplicated(sn), ]


fp = file.path('..', 'input', 'SKM_2025-07-08')
fn = 'rxn-nodes-public.tsv'
pss = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
ind = grep('^name$|^all_pathways|^short_name$', colnames(pss), value = TRUE)
pss = pss[, ..ind]
ind = grep('\\[', pss$name)
pss = pss[ind, ]

pss[, ids_string := stringr::str_extract(name, "(?<=\\[)[^\\]]+(?=\\])")]
pss[, ids_list := strsplit(ids_string, split = ",")]
max_ids = max(lengths(pss$ids_list))
for (i in seq_len(max_ids)) {
  pss[[paste0("id_", i)]] = sapply(pss$ids_list, function(x) ifelse(length(x) >= i, x[i], NA))
}
pss[, c("ids_string", "ids_list") := NULL]

pss_long = melt(
  pss,
  id.vars = c("name", "all_pathways", 'short_name'),       # Columns to keep as is
  measure.vars = patterns("^id_"),           # Columns to melt (all starting with "id_")
  variable.name = "id_num",                   # Name for the melted variable column
  value.name = "id"                           # Name for the melted value column
)

pss_long = pss_long[!is.na(id) & id != ""]
pss_long[, id_num := NULL]
pss_long[, name := NULL]
pss_long$id = sub('\\..*', '', pss_long$id)
pss_long = pss_long[!duplicated(pss_long), ]
table(duplicated(pss_long$id))
## 
## FALSE  TRUE 
##   816    24
pss_long %>%
  dplyr::filter(id %in% id[duplicated(id)] & stringr::str_starts(id, "^AT")) %>%
  dplyr::arrange(id) %>%
  print()
##                                              all_pathways short_name        id
##                                                    <char>     <char>    <char>
##  1:                               Hormone - Ethylene (ET)      ORA59 AT1G06160
##  2:                               Hormone - Ethylene (ET)  ERF/ORA59 AT1G06160
##  3:                         Hormone - Salicylic acid (SA)       TCP8 AT1G58100
##  4:                         Hormone - Salicylic acid (SA) TCP8,14,15 AT1G58100
##  5:                               Hormone - Ethylene (ET)       EDF2 AT1G68840
##  6:                               Hormone - Ethylene (ET)    ERF/EDF AT1G68840
##  7: Signalling - Heat-shock proteins (HSPs),Stress - Heat      HSP70 AT3G12580
##  8:               Signalling - Heat-shock proteins (HSPs)        HSP AT3G12580
##  9:                               Hormone - Ethylene (ET)       ERF1 AT3G23240
## 10:                               Hormone - Ethylene (ET)    ERF/EDF AT3G23240
## 11:                               Hormone - Ethylene (ET)       ERF6 AT4G17490
## 12:                               Hormone - Ethylene (ET)  ERF/ORA59 AT4G17490
## 13:                               Hormone - Ethylene (ET)       ERF1 AT4G17500
## 14:                               Hormone - Ethylene (ET)    ERF/EDF AT4G17500
## 15:               Signalling - Heat-shock proteins (HSPs)     MED37E AT5G02500
## 16:               Signalling - Heat-shock proteins (HSPs)        HSP AT5G02500
## 17:                               Hormone - Ethylene (ET)     ERF096 AT5G43410
## 18:                               Hormone - Ethylene (ET)    ERF/EDF AT5G43410
## 19:                               Hormone - Ethylene (ET)       ERF5 AT5G47230
## 20:                               Hormone - Ethylene (ET)  ERF/ORA59 AT5G47230
## 21:                               Hormone - Ethylene (ET)     ERF105 AT5G51190
## 22:                               Hormone - Ethylene (ET)  ERF/ORA59 AT5G51190
## 23:               Signalling - Heat-shock proteins (HSPs) HSP18.1-CI AT5G59720
## 24:               Signalling - Heat-shock proteins (HSPs)        HSP AT5G59720
## 25:                               Hormone - Ethylene (ET)     ERF104 AT5G61600
## 26:                               Hormone - Ethylene (ET)    ERF/EDF AT5G61600
##                                              all_pathways short_name        id
pss_long = pss_long %>%
  dplyr::filter(stringr::str_starts(id, "AT")) %>%
  dplyr::group_by(id) %>%
  dplyr::summarise(
    dplyr::across(
      .cols = dplyr::everything(),
      .fns = ~ {
        vals = unique(na.omit(.))
        if (length(vals) > 1) paste(vals, collapse = " | ")
        else if (length(vals) == 1) vals
        else NA_character_
      }
    ),
    .groups = "drop"
  )

pss_long[pss_long == ""] = "-"
gn[is.na(gn)] = "-"
sn[is.na(sn)] = "-"

ath.annot = pss_long %>%
  dplyr::full_join(gn, by = c("id" = "V1")) %>%
  dplyr::full_join(sn, by = c("id" = "V1"))

Note: 35.2 bin matcheswill be ignored

3 Abbreviations

Plant Name Label JCVI-MCScan Compara Plants Plaza OrthoDB FastOma RBH Mercator
Malus domestica apple mdo_GDv1 malus_domestica_golden mdo mdo mdo mdo mdo
mdo_HChap1
Prunus persica ppe ppe prunus_persica ppe ppe pper ppe pper
Prunus dulcis / P. amygdalus almond almond prunus_dulcis pdul pdul pdul pdul
Prunus avium wild cherry wildcherry prunus_avium pavi pavi pavi pavi
Prunus armeniaca apricot apricot parm parm parm parm
Prunus cerasifera cherry plum cherryplum pcer pcer pcer
Pyrus pear pear pcox pcox pcox
Prunus sibirica Siberian apricot siberianapricot psib psib psib psib
group = "fruitTrees"
params_list <- list(
  plantName = 'mdo'
  , plantNameOut = "apple"
  , plantDirOut = file.path('..', 'reports', group, "apple")
  , flag = 1
)

env <- new.env()
list2env(params_list, envir = env)

<environment: 0x000002101a545080>

child_content <- knitr::knit_child("09_translate-child.Rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./09_translate-child.Rmd

| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-10] | |…… | 11% | |…….. | 15% [unnamed-chunk-11] | |……… | 19% | |……….. | 22% [unnamed-chunk-12] | |…………. | 26% | |…………… | 30% [unnamed-chunk-13] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-14] | |………………… | 41% | |………………….. | 44% [unnamed-chunk-15] | |……………………. | 48% | |…………………….. | 52% [unnamed-chunk-16] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-17] | |………………………….. | 63% | |……………………………. | 67% [unnamed-chunk-18] | |……………………………… | 70% | |……………………………….. | 74% [unnamed-chunk-19] | |…………………………………. | 78% | |…………………………………… | 81% [unnamed-chunk-20] | |……………………………………. | 85% | |……………………………………… | 89% [unnamed-chunk-21] | |……………………………………….. | 93% | |…………………………………………. | 96% [unnamed-chunk-22] | |……………………………………………| 100%

cat(child_content)

4 Subsection: mdo

if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

4.1 Ortho sources

fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]

df = NULL

for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  if(us == 'compara') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'FastOMA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'MCScanX') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'PLAZA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'OrthoDB') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'RBH') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  }   else cat ('Ignore source(s):', us, '\n')
}
## [1] "../intermediate/_OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator 
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
cat("Pre filter Sources:\n", table(df$source), "\n")
## Pre filter Sources:
##  20997 77128 32036 112138 30068 37682
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID" 


df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 24 × 5
##    from_geneID from_plant to_geneID                   to_plant source 
##    <chr>       <chr>      <chr>                       <chr>    <chr>  
##  1 AT4G10330   ath        Maldo.hc.v1a1.ch10A.g00003  mdo      FastOMA
##  2 AT4G10340   ath        Maldo.hc.v1a1.ch10A.g00004  mdo      FastOMA
##  3 AT5G44410   ath        g1                          mdo      FastOMA
##  4 AT5G44440   ath        g1                          mdo      FastOMA
##  5 AT1G03770   ath        Maldo.hc.v1a1.ch10A.g02825  mdo      MCScanX
##  6 AT1G03800   ath        Maldo.hc.v1a1.ch10A.g02815  mdo      MCScanX
##  7 ATMG00910   ath        Maldo.hc.v1a1.sc45A.g49878  mdo      MCScanX
##  8 ATMG01020   ath        Maldo.hc.v1a1.sc45A.g49883  mdo      MCScanX
##  9 AT2G46320   ath        Maldo.hc.v1a1.ch1A.g26266   mdo      OrthoDB
## 10 AT4G27940   ath        Maldo.hc.v1a1.ch1A.g26266   mdo      OrthoDB
## 11 AT2G07695   ath        Maldo.hc.v1a1.sc164A.g48922 mdo      OrthoDB
## 12 AT2G07695   ath        Maldo.hc.v1a1.sc119A.g48697 mdo      OrthoDB
## 13 AT2G28190   ath        Maldo.hc.v1a1.ch6A.g38979   mdo      PLAZA  
## 14 AT2G07732   ath        Maldo.hc.v1a1.sc90A.g49976  mdo      PLAZA  
## 15 AT5G17400   ath        Maldo.hc.v1a1.ch9A.g48118   mdo      PLAZA  
## 16 AT2G47300   ath        Maldo.hc.v1a1.ch17A.g24110  mdo      PLAZA  
## 17 AT1G01020   ath        Maldo.hc.v1a1.ch7A.g41990   mdo      RBH    
## 18 AT1G01030   ath        Maldo.hc.v1a1.ch1A.g25187   mdo      RBH    
## 19 ATMG01410   ath        Maldo.hc.v1a1.sc36A.g49760  mdo      RBH    
## 20 ATMG01410   ath        Maldo.hc.v1a1.sc71A.g49908  mdo      RBH    
## 21 AT4G39400   ath        Maldo.hc.v1a1.ch2A.g27501   mdo      compara
## 22 AT4G39400   ath        Maldo.hc.v1a1.ch15A.g17036  mdo      compara
## 23 AT1G65880   ath        Maldo.hc.v1a1.ch17A.g24066  mdo      compara
## 24 AT5G52820   ath        Maldo.hc.v1a1.ch17A.g24067  mdo      compara
df = df[!duplicated(df), ]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  2250109 120.2    4385465 234.3  3000814 160.3
## Vcells 20054457 153.1   32427276 247.5 32422094 247.4

4.2 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 23046
length(unique(dt$to_geneID))
## [1] 34410
table(dt$source)
## 
## compara FastOMA MCScanX OrthoDB   PLAZA     RBH 
##   20997   77128   32036   56069   30068   37682
dt[, present := TRUE]

dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

4.3 Upset plot

if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}


dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))

dff = as.data.frame(dt.wide)

upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods") +
  theme(legend.position = "none")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ComplexUpset package.
##   Please report the issue at
##   <https://github.com/krassowski/complex-upset/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Print or/and save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf") # change name

4.4 Gene occurence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

4.5 In/out PSS

df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)

openxlsx::write.xlsx(nin, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'), 
                     asTable = TRUE) # change name

4.6 plant gmm

fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]

colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "OrthoDB"         "PLAZA"           "RBH"             "compara"        
##  [9] "from_count"      "to_count"        "count_evidence"  "ath_BINCODE"    
## [13] "ath_NAME"        "ath_DESCRIPTION" "all_pathways"    "short_name"     
## [17] "athName"         "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))
## 
##  FALSE   TRUE 
## 117481     27
dt[is.na(dt$plant_BINCODE), ]$to_geneID # check ones with strange ID
##  [1] "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1"
## [16] "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##             used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   8153626 435.5   14083611  752.2  14083611  752.2
## Vcells 127008260 969.0  196766967 1501.3 196766967 1501.3

4.7 Translation table

MapMan Mercator matches: first three levels only

df = df[!duplicated(df), ]


compare_bin <- function(athMercator, plantXMercator) {
  # Helper: Split and truncate tokens to first 3 dot-separated levels
  split_tokens <- function(code) {
    if (is.na(code) || trimws(code) == "") return(character(0))
    
    parts <- unlist(strsplit(code, "\\|"))
    tokens <- unlist(strsplit(parts, ";"))
    tokens <- unique(trimws(tokens))
    
    trunc3levels <- function(token) {
      levels <- unlist(strsplit(token, "\\."))
      paste(head(levels, 3), collapse = ".")
    }
    
    unique(sapply(tokens, trunc3levels))
  }
  
  bin_set <- split_tokens(athMercator)
  v4_set <- split_tokens(plantXMercator)
  
  # If both sets are empty, return "no match"
  if (length(bin_set) == 0 && length(v4_set) == 0) {
    return("no match")
  }
  
  # Check for redundant annotation (e.g. "35.2|35.2|35.2")
  v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
  if (length(bin_set) == 1 &&
      length(v4_parts) > 1 &&
      all(split_tokens(plantXMercator) == bin_set)) {
    result <- paste0("100% match based on ", bin_set)
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for exact match
  if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
    result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for partial match
  common_tokens <- intersect(bin_set, v4_set)
  if (length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }
  
  return("no match")
}



df = df %>%
  dplyr::rowwise() %>%
  dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name 
  dplyr::ungroup()

table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])
##       
##         35.2
##   35.2 23180

4.8 Filter

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 23046 
##      # mdo unigue genes: 34410 
##      # ath highest connection degree:  128 
##      # mdo highest connection degree:  116 
##      # genes in ath with degree > 30:  319 
##      # genes in mdo with degree > 30:  288
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()


if (flag == 1) {
  methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  methods = c("MCScanX", "FastOMA", 'RBH')
}


match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
  dt[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]

ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##          72085          23180          13328           8915
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1       36847      10783    10916           7960
##   2        9865       3217     1663            576
##   3        6786       2076      473            178
##   4        6739       2218      150            104
##   5        7352       2776       96             69
##   6        4496       2110       30             28
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")




if (flag != 4) {
  special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
  special_methods = c("FastOMA", 'RBH')
}

# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))

for (method in methods) {

  base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE & 
               !(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
  add_cond = rep(TRUE, nrow(dt))
  
  if (method %in% special_methods) {
    add_cond = rep(TRUE, nrow(dt))
  }
  
  candidates = which(base_cond & add_cond)
  
  if (length(candidates) > 0) {
    if (method %in% special_methods) {
      for (i in candidates) {
        row = dt[i]
        covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
        count_covered = length(covered_by)
        
        is_candidate = FALSE
        new_criteria = NULL
        
        if (count_covered == 3) {
          is_candidate = TRUE
          new_criteria = "OrthoDB_FastOMA_RBH"
        } else if (count_covered == 2) {
          is_candidate = TRUE
          new_criteria = paste(sort(covered_by), collapse = "_")
        } else if (count_covered == 1) {
          # Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
          if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
            is_candidate = TRUE
            new_criteria = paste0(method, "_MapMan4")
            
            # Increment count for this mapman4 assignment
            mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
          }
        }
        
        if (is_candidate) {
          dt[i, filter_criteria := new_criteria]
          # covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID)) # if you want to remove both IDs from pair
          covered_genes = unique(c(covered_genes, row$to_geneID))
        }
      }
    } else {
      dt[candidates, filter_criteria := method]
      # covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)])) # if you want to remove both IDs from pair
      covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
    }
  }
}

# After the loop, print checkpoint counts for method_MapMan4 assignments
cat('####  ####  MapMan4 assignment counts per method ####  ####  \n')
## ####  ####  MapMan4 assignment counts per method ####  ####
print(mapman4_counts)
## OrthoDB_MapMan4 FastOMA_MapMan4     RBH_MapMan4 
##            3003            3905            1103
df = dt

data.table::fwrite(df, 
                   paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'), 
                     asTable = TRUE)

4.9 Filtered

rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]

# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]


par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

long_dt = rbindlist(lapply(methods, function(method) {
  kept[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method (post filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##          37256          12293           3640           2267
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1        9275       1870     1615           1604
##   2        5288       2021     1354            341
##   3        5006       1595      402            135
##   4        6010       2003      144             95
##   5        7181       2694       95             64
##   6        4496       2110       30             28
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
                                                    "OrthoDB_FastOMA_RBH",
                                                    "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                                    "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                                    ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Selection by method and methods coverage",
    x = "Filter criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) + 
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    panel.border = element_rect(color = "black", fill = NA, size = 1),  # border around each facet
  )
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


openxlsx::write.xlsx(rejected, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'), 
                     asTable = TRUE)


edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept, 
                     paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'), 
                     asTable = TRUE)


if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}

# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 20470 
##      # mdo unigue genes: 31752 
##      # ath highest connection degree:  59 
##      # mdo highest connection degree:  102 
##      # genes in ath with degree > 30:  28 
##      # genes in mdo with degree > 30:  25
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
  "MCScanX",
  "compara",
  "PLAZA",
  "OrthoDB_FastOMA_RBH",
  "OrthoDB_RBH",
  "FastOMA_OrthoDB",
  "FastOMA_RBH",
  "OrthoDB_MapMan4",
  "RBH_MapMan4",
  "FastOMA_MapMan4",
  "reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)
##         filter_criteria Count
##                  <fctr> <int>
##  1:             MCScanX 32036
##  2:             compara  5278
##  3:               PLAZA  6568
##  4: OrthoDB_FastOMA_RBH   579
##  5:         OrthoDB_RBH   667
##  6:     FastOMA_OrthoDB  1240
##  7:         FastOMA_RBH  1077
##  8:     OrthoDB_MapMan4  3003
##  9:         RBH_MapMan4  1103
## 10:     FastOMA_MapMan4  3905
## 11:              reject 62052

4.10 PSS kept/rejected

pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(dt), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)
## 
##             compara     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH 
##                 163                 121                  71                  35 
##             MCScanX OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH 
##                1336                  26                 118                  30 
##               PLAZA         RBH_MapMan4              reject 
##                 264                  28                1753
openxlsx::write.xlsx(pss, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'), 
                     asTable = TRUE)
params_list <- list(
  plantName = 'ppe'
  , plantNameOut = "peach"
  , plantDirOut = file.path('..', 'reports', group, "peach")
  , flag = 1
)

env <- new.env()
list2env(params_list, envir = env)

<environment: 0x00000210ad79ddd8>

child_content <- knitr::knit_child("09_translate-child.Rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./09_translate-child.Rmd

| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-38] | |…… | 11% | |…….. | 15% [unnamed-chunk-39] | |……… | 19% | |……….. | 22% [unnamed-chunk-40] | |…………. | 26% | |…………… | 30% [unnamed-chunk-41] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-42] | |………………… | 41% | |………………….. | 44% [unnamed-chunk-43] | |……………………. | 48% | |…………………….. | 52% [unnamed-chunk-44] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-45] | |………………………….. | 63% | |……………………………. | 67% [unnamed-chunk-46] | |……………………………… | 70% | |……………………………….. | 74% [unnamed-chunk-47] | |…………………………………. | 78% | |…………………………………… | 81% [unnamed-chunk-48] | |……………………………………. | 85% | |……………………………………… | 89% [unnamed-chunk-49] | |……………………………………….. | 93% | |…………………………………………. | 96% [unnamed-chunk-50] | |……………………………………………| 100%

cat(child_content)

5 Subsection: ppe

if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

5.1 Ortho sources

fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]

df = NULL

for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  if(us == 'compara') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'FastOMA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'MCScanX') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'PLAZA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'OrthoDB') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'RBH') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  }   else cat ('Ignore source(s):', us, '\n')
}
## [1] "../intermediate/_OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator 
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
cat("Pre filter Sources:\n", table(df$source), "\n")
## Pre filter Sources:
##  16193 44006 17894 76740 20791 24564
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID" 


df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 24 × 5
##    from_geneID from_plant to_geneID      to_plant source 
##    <chr>       <chr>      <chr>          <chr>    <chr>  
##  1 AT1G12040   ath        Prupe.1G000500 ppe      FastOMA
##  2 AT1G62440   ath        Prupe.1G000500 ppe      FastOMA
##  3 AT1G61010   ath        Prupe.I006100  ppe      FastOMA
##  4 AT1G61010   ath        Prupe.I006200  ppe      FastOMA
##  5 AT1G04230   ath        Prupe.1G027200 ppe      MCScanX
##  6 AT1G04240   ath        Prupe.1G027500 ppe      MCScanX
##  7 ATCG00530   ath        Prupe.7G029500 ppe      MCScanX
##  8 ATCG00680   ath        Prupe.7G029800 ppe      MCScanX
##  9 AT3G17900   ath        Prupe.1G267800 ppe      OrthoDB
## 10 AT4G35230   ath        Prupe.1G355500 ppe      OrthoDB
## 11 AT2G07675   ath        Prupe.6G146800 ppe      OrthoDB
## 12 ATMG00980   ath        Prupe.6G146800 ppe      OrthoDB
## 13 AT3G02890   ath        Prupe.2G329000 ppe      PLAZA  
## 14 AT5G16680   ath        Prupe.2G329000 ppe      PLAZA  
## 15 AT4G03170   ath        Prupe.4G260600 ppe      PLAZA  
## 16 AT4G03160   ath        Prupe.4G260600 ppe      PLAZA  
## 17 AT1G01030   ath        Prupe.5G134900 ppe      RBH    
## 18 AT1G01040   ath        Prupe.2G200900 ppe      RBH    
## 19 ATMG01250   ath        Prupe.6G123900 ppe      RBH    
## 20 ATMG01250   ath        Prupe.7G164000 ppe      RBH    
## 21 AT5G01010   ath        Prupe.6G300700 ppe      compara
## 22 AT5G01020   ath        Prupe.6G300300 ppe      compara
## 23 AT1G80940   ath        Prupe.3G103600 ppe      compara
## 24 AT1G80950   ath        Prupe.1G382400 ppe      compara
df = df[!duplicated(df), ]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   6073948  324.4   11266889  601.8  14083611  752.2
## Vcells 131792040 1005.5  226873834 1731.0 196766967 1501.3

5.2 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 23113
length(unique(dt$to_geneID))
## [1] 21245
table(dt$source)
## 
## compara FastOMA MCScanX OrthoDB   PLAZA     RBH 
##   16193   44006   17894   38370   20791   24564
dt[, present := TRUE]

dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

5.3 Upset plot

if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}


dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))

dff = as.data.frame(dt.wide)

upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods") +
  theme(legend.position = "none")

# Print or/and save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf") # change name

5.4 Gene occurence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

5.5 In/out PSS

df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)

openxlsx::write.xlsx(nin, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'), 
                     asTable = TRUE) # change name

5.6 plant gmm

fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]

colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "OrthoDB"         "PLAZA"           "RBH"             "compara"        
##  [9] "from_count"      "to_count"        "count_evidence"  "ath_BINCODE"    
## [13] "ath_NAME"        "ath_DESCRIPTION" "all_pathways"    "short_name"     
## [17] "athName"         "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))
## 
## FALSE  TRUE 
## 71222   231
dt[is.na(dt$plant_BINCODE), ]$to_geneID # check ones with strange ID
##   [1] "Prupe.I000100" "Prupe.I000200" "Prupe.I000200" "Prupe.I000200"
##   [5] "Prupe.I000200" "Prupe.I000200" "Prupe.I000200" "Prupe.I000300"
##   [9] "Prupe.I000300" "Prupe.I000300" "Prupe.I000400" "Prupe.I000400"
##  [13] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [17] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [21] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [25] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [29] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [33] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [37] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [41] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [45] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
##  [49] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000600"
##  [53] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [57] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [61] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [65] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [69] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [73] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [77] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [81] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [85] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
##  [89] "Prupe.I000600" "Prupe.I000600" "Prupe.I000700" "Prupe.I000700"
##  [93] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
##  [97] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [101] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [105] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [109] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [113] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [117] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [121] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [125] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [129] "Prupe.I000700" "Prupe.I000800" "Prupe.I000800" "Prupe.I000800"
## [133] "Prupe.I000800" "Prupe.I000800" "Prupe.I000800" "Prupe.I000800"
## [137] "Prupe.I000900" "Prupe.I000900" "Prupe.I000900" "Prupe.I000900"
## [141] "Prupe.I000900" "Prupe.I000900" "Prupe.I000900" "Prupe.I001000"
## [145] "Prupe.I001000" "Prupe.I001000" "Prupe.I001000" "Prupe.I001000"
## [149] "Prupe.I001000" "Prupe.I001000" "Prupe.I001100" "Prupe.I001100"
## [153] "Prupe.I001100" "Prupe.I001600" "Prupe.I001700" "Prupe.I001700"
## [157] "Prupe.I001700" "Prupe.I001800" "Prupe.I001800" "Prupe.I001800"
## [161] "Prupe.I001800" "Prupe.I001800" "Prupe.I001800" "Prupe.I001800"
## [165] "Prupe.I001800" "Prupe.I001800" "Prupe.I001900" "Prupe.I001900"
## [169] "Prupe.I002100" "Prupe.I002100" "Prupe.I002300" "Prupe.I002300"
## [173] "Prupe.I002300" "Prupe.I002400" "Prupe.I002400" "Prupe.I002600"
## [177] "Prupe.I002600" "Prupe.I002800" "Prupe.I002800" "Prupe.I002900"
## [181] "Prupe.I002900" "Prupe.I003000" "Prupe.I003000" "Prupe.I003100"
## [185] "Prupe.I003100" "Prupe.I003100" "Prupe.I003200" "Prupe.I003200"
## [189] "Prupe.I003200" "Prupe.I003300" "Prupe.I003400" "Prupe.I003400"
## [193] "Prupe.I003400" "Prupe.I003400" "Prupe.I003400" "Prupe.I003400"
## [197] "Prupe.I003400" "Prupe.I003800" "Prupe.I003800" "Prupe.I003900"
## [201] "Prupe.I003900" "Prupe.I004000" "Prupe.I004000" "Prupe.I004400"
## [205] "Prupe.I004400" "Prupe.I004400" "Prupe.I004400" "Prupe.I004400"
## [209] "Prupe.I004400" "Prupe.I004500" "Prupe.I004500" "Prupe.I004500"
## [213] "Prupe.I004500" "Prupe.I004500" "Prupe.I004500" "Prupe.I005000"
## [217] "Prupe.I005000" "Prupe.I005100" "Prupe.I005200" "Prupe.I005200"
## [221] "Prupe.I005200" "Prupe.I005200" "Prupe.I005200" "Prupe.I005500"
## [225] "Prupe.I005500" "Prupe.I005500" "Prupe.I005600" "Prupe.I005700"
## [229] "Prupe.I005800" "Prupe.I006100" "Prupe.I006200"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  6340525 338.7   11266889  601.8  14083611  752.2
## Vcells 91829013 700.6  226881563 1731.0 226881563 1731.0

5.7 Translation table

MapMan Mercator matches: first three levels only

df = df[!duplicated(df), ]


compare_bin <- function(athMercator, plantXMercator) {
  # Helper: Split and truncate tokens to first 3 dot-separated levels
  split_tokens <- function(code) {
    if (is.na(code) || trimws(code) == "") return(character(0))
    
    parts <- unlist(strsplit(code, "\\|"))
    tokens <- unlist(strsplit(parts, ";"))
    tokens <- unique(trimws(tokens))
    
    trunc3levels <- function(token) {
      levels <- unlist(strsplit(token, "\\."))
      paste(head(levels, 3), collapse = ".")
    }
    
    unique(sapply(tokens, trunc3levels))
  }
  
  bin_set <- split_tokens(athMercator)
  v4_set <- split_tokens(plantXMercator)
  
  # If both sets are empty, return "no match"
  if (length(bin_set) == 0 && length(v4_set) == 0) {
    return("no match")
  }
  
  # Check for redundant annotation (e.g. "35.2|35.2|35.2")
  v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
  if (length(bin_set) == 1 &&
      length(v4_parts) > 1 &&
      all(split_tokens(plantXMercator) == bin_set)) {
    result <- paste0("100% match based on ", bin_set)
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for exact match
  if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
    result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for partial match
  common_tokens <- intersect(bin_set, v4_set)
  if (length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }
  
  return("no match")
}



df = df %>%
  dplyr::rowwise() %>%
  dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name 
  dplyr::ungroup()

table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])
##       
##         35.2
##   35.2 16428

5.8 Filter

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 23113 
##      # ppe unigue genes: 21245 
##      # ath highest connection degree:  57 
##      # ppe highest connection degree:  115 
##      # genes in ath with degree > 30:  264 
##      # genes in ppe with degree > 30:  135
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()


if (flag == 1) {
  methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  methods = c("MCScanX", "FastOMA", 'RBH')
}


match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
  dt[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]

ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##          43931          16428           6360           4734
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1       21126       7929     5410           4119
##   2        6460       2272      560            368
##   3        4106       1255      185            103
##   4        4182       1339      103             68
##   5        4892       1998       68             51
##   6        3165       1635       34             25
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")




if (flag != 4) {
  special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
  special_methods = c("FastOMA", 'RBH')
}

# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))

for (method in methods) {

  base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE & 
               !(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
  add_cond = rep(TRUE, nrow(dt))
  
  if (method %in% special_methods) {
    add_cond = rep(TRUE, nrow(dt))
  }
  
  candidates = which(base_cond & add_cond)
  
  if (length(candidates) > 0) {
    if (method %in% special_methods) {
      for (i in candidates) {
        row = dt[i]
        covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
        count_covered = length(covered_by)
        
        is_candidate = FALSE
        new_criteria = NULL
        
        if (count_covered == 3) {
          is_candidate = TRUE
          new_criteria = "OrthoDB_FastOMA_RBH"
        } else if (count_covered == 2) {
          is_candidate = TRUE
          new_criteria = paste(sort(covered_by), collapse = "_")
        } else if (count_covered == 1) {
          # Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
          if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
            is_candidate = TRUE
            new_criteria = paste0(method, "_MapMan4")
            
            # Increment count for this mapman4 assignment
            mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
          }
        }
        
        if (is_candidate) {
          dt[i, filter_criteria := new_criteria]
          # covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID)) # if you want to remove both IDs from pair
          covered_genes = unique(c(covered_genes, row$to_geneID))
        }
      }
    } else {
      dt[candidates, filter_criteria := method]
      # covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)])) # if you want to remove both IDs from pair
      covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
    }
  }
}

# After the loop, print checkpoint counts for method_MapMan4 assignments
cat('####  ####  MapMan4 assignment counts per method ####  ####  \n')
## ####  ####  MapMan4 assignment counts per method ####  ####
print(mapman4_counts)
## OrthoDB_MapMan4 FastOMA_MapMan4     RBH_MapMan4 
##            2181            2171             641
df = dt

data.table::fwrite(df, 
                   paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'), 
                     asTable = TRUE)

5.9 Filtered

rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]

# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]


par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

long_dt = rbindlist(lapply(methods, function(method) {
  kept[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method (post filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##          23698           8434           1556           1482
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1        6038       1402      853           1043
##   2        3413       1431      366            219
##   3        2797        888      143             81
##   4        3607       1168       92             63
##   5        4678       1910       68             51
##   6        3165       1635       34             25
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
                                                    "OrthoDB_FastOMA_RBH",
                                                    "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                                    "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                                    ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Selection by method and methods coverage",
    x = "Filter criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) + 
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    panel.border = element_rect(color = "black", fill = NA, size = 1),  # border around each facet
  )

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


openxlsx::write.xlsx(rejected, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'), 
                     asTable = TRUE)


edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept, 
                     paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'), 
                     asTable = TRUE)


if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}

# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 19985 
##      # ppe unigue genes: 20047 
##      # ath highest connection degree:  40 
##      # ppe highest connection degree:  102 
##      # genes in ath with degree > 30:  6 
##      # genes in ppe with degree > 30:  15
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
  "MCScanX",
  "compara",
  "PLAZA",
  "OrthoDB_FastOMA_RBH",
  "OrthoDB_RBH",
  "FastOMA_OrthoDB",
  "FastOMA_RBH",
  "OrthoDB_MapMan4",
  "RBH_MapMan4",
  "FastOMA_MapMan4",
  "reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)
##         filter_criteria Count
##                  <fctr> <int>
##  1:             MCScanX 17894
##  2:             compara  5084
##  3:               PLAZA  4926
##  4: OrthoDB_FastOMA_RBH   415
##  5:         OrthoDB_RBH   516
##  6:     FastOMA_OrthoDB  1109
##  7:         FastOMA_RBH   233
##  8:     OrthoDB_MapMan4  2181
##  9:         RBH_MapMan4   641
## 10:     FastOMA_MapMan4  2171
## 11:              reject 36283

5.10 PSS kept/rejected

pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(dt), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)
## 
##             compara     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH 
##                 145                  50                  43                   7 
##             MCScanX OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH 
##                 728                  27                  48                  16 
##               PLAZA         RBH_MapMan4              reject 
##                 166                  15                1149
openxlsx::write.xlsx(pss, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'), 
                     asTable = TRUE)
params_list <- list(
  plantName = 'pdul'
  , plantNameOut = "almond"
  , plantDirOut = file.path('..', 'reports', group, "almond")
  , flag = 2
)

# note: in compara - geneID and prot ID are completely different

env <- new.env()
list2env(params_list, envir = env)

<environment: 0x00000210aba52270>

child_content <- knitr::knit_child("09_translate-child.Rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./09_translate-child.Rmd

| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-66] | |…… | 11% | |…….. | 15% [unnamed-chunk-67] | |……… | 19% | |……….. | 22% [unnamed-chunk-68] | |…………. | 26% | |…………… | 30% [unnamed-chunk-69] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-70] | |………………… | 41% | |………………….. | 44% [unnamed-chunk-71] | |……………………. | 48% | |…………………….. | 52% [unnamed-chunk-72] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-73] | |………………………….. | 63% | |……………………………. | 67% [unnamed-chunk-74] | |……………………………… | 70% | |……………………………….. | 74% [unnamed-chunk-75] | |…………………………………. | 78% | |…………………………………… | 81% [unnamed-chunk-76] | |……………………………………. | 85% | |……………………………………… | 89% [unnamed-chunk-77] | |……………………………………….. | 93% | |…………………………………………. | 96% [unnamed-chunk-78] | |……………………………………………| 100%

cat(child_content)

6 Subsection: pdul

if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

6.1 Ortho sources

fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]

df = NULL

for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  if(us == 'compara') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'FastOMA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'MCScanX') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'PLAZA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'OrthoDB') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'RBH') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  }   else cat ('Ignore source(s):', us, '\n')
}
## [1] "../intermediate/_OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator 
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
cat("Pre filter Sources:\n", table(df$source), "\n")
## Pre filter Sources:
##  15975 42927 20148 71468 24829
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID" 


df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 20 × 5
##    from_geneID from_plant to_geneID      to_plant source 
##    <chr>       <chr>      <chr>          <chr>    <chr>  
##  1 AT2G05620   ath        TexasF1_G1000  pdul     FastOMA
##  2 AT3G48880   ath        TexasF1_G10001 pdul     FastOMA
##  3 AT3G48890   ath        TexasF1_G9999  pdul     FastOMA
##  4 AT5G52240   ath        TexasF1_G9999  pdul     FastOMA
##  5 AT1G04220   ath        TexasF1_G767   pdul     MCScanX
##  6 AT1G04230   ath        TexasF1_G779   pdul     MCScanX
##  7 ATCG00170   ath        TexasF1_G22620 pdul     MCScanX
##  8 ATCG00500   ath        TexasF1_G22624 pdul     MCScanX
##  9 AT1G23390   ath        TexasF1_G3359  pdul     OrthoDB
## 10 AT5G19210   ath        TexasF1_G2060  pdul     OrthoDB
## 11 AT3G51810   ath        TexasF1_G23162 pdul     OrthoDB
## 12 AT2G28815   ath        TexasF1_G6420  pdul     OrthoDB
## 13 AT1G01030   ath        TexasF1_G18833 pdul     RBH    
## 14 AT1G01040   ath        TexasF1_G9057  pdul     RBH    
## 15 ATMG00860   ath        TexasF1_G25095 pdul     RBH    
## 16 ATMG01250   ath        TexasF1_G22012 pdul     RBH    
## 17 AT4G37540   ath        TexasF1_G22582 pdul     compara
## 18 AT5G67420   ath        TexasF1_G22582 pdul     compara
## 19 AT4G15960   ath        TexasF1_G27715 pdul     compara
## 20 AT3G45140   ath        TexasF1_G6796  pdul     compara
df = df[!duplicated(df), ]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  5041852 269.3   11266889  601.8  14083611  752.2
## Vcells 94094229 717.9  226881563 1731.0 226881563 1731.0

6.2 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 22451
length(unique(dt$to_geneID))
## [1] 20903
table(dt$source)
## 
## compara FastOMA MCScanX OrthoDB     RBH 
##   15975   42927   20148   35734   24829
dt[, present := TRUE]

dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

6.3 Upset plot

if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}


dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))

dff = as.data.frame(dt.wide)

upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods") +
  theme(legend.position = "none")

# Print or/and save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf") # change name

6.4 Gene occurence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

6.5 In/out PSS

df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)

openxlsx::write.xlsx(nin, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'), 
                     asTable = TRUE) # change name

6.6 plant gmm

fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]

colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "OrthoDB"         "RBH"             "compara"         "from_count"     
##  [9] "to_count"        "count_evidence"  "ath_BINCODE"     "ath_NAME"       
## [13] "ath_DESCRIPTION" "all_pathways"    "short_name"      "athName"        
## [17] "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))
## 
## FALSE 
## 67030
dt[is.na(dt$plant_BINCODE), ]$to_geneID # check ones with strange ID
## character(0)
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  4675758 249.8   11266889  601.8  14083611  752.2
## Vcells 59036007 450.5  181505251 1384.8 226881563 1731.0

6.7 Translation table

MapMan Mercator matches: first three levels only

df = df[!duplicated(df), ]


compare_bin <- function(athMercator, plantXMercator) {
  # Helper: Split and truncate tokens to first 3 dot-separated levels
  split_tokens <- function(code) {
    if (is.na(code) || trimws(code) == "") return(character(0))
    
    parts <- unlist(strsplit(code, "\\|"))
    tokens <- unlist(strsplit(parts, ";"))
    tokens <- unique(trimws(tokens))
    
    trunc3levels <- function(token) {
      levels <- unlist(strsplit(token, "\\."))
      paste(head(levels, 3), collapse = ".")
    }
    
    unique(sapply(tokens, trunc3levels))
  }
  
  bin_set <- split_tokens(athMercator)
  v4_set <- split_tokens(plantXMercator)
  
  # If both sets are empty, return "no match"
  if (length(bin_set) == 0 && length(v4_set) == 0) {
    return("no match")
  }
  
  # Check for redundant annotation (e.g. "35.2|35.2|35.2")
  v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
  if (length(bin_set) == 1 &&
      length(v4_parts) > 1 &&
      all(split_tokens(plantXMercator) == bin_set)) {
    result <- paste0("100% match based on ", bin_set)
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for exact match
  if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
    result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for partial match
  common_tokens <- intersect(bin_set, v4_set)
  if (length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }
  
  return("no match")
}



df = df %>%
  dplyr::rowwise() %>%
  dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name 
  dplyr::ungroup()

table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])
##       
##         35.2
##   35.2 14567

6.8 Filter

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 22451 
##      # pdul unigue genes: 20903 
##      # ath highest connection degree:  150 
##      # pdul highest connection degree:  113 
##      # genes in ath with degree > 30:  171 
##      # genes in pdul with degree > 30:  120
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()


if (flag == 1) {
  methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  methods = c("MCScanX", "FastOMA", 'RBH')
}


match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
  dt[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]

ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##          39774          14567           7625           5064
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1       19498       7186     6154           4134
##   2        6174       1991      848            468
##   3        4138       1446      289            176
##   4        4954       1821      187            146
##   5        5010       2123      147            140
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")




if (flag != 4) {
  special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
  special_methods = c("FastOMA", 'RBH')
}

# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))

for (method in methods) {

  base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE & 
               !(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
  add_cond = rep(TRUE, nrow(dt))
  
  if (method %in% special_methods) {
    add_cond = rep(TRUE, nrow(dt))
  }
  
  candidates = which(base_cond & add_cond)
  
  if (length(candidates) > 0) {
    if (method %in% special_methods) {
      for (i in candidates) {
        row = dt[i]
        covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
        count_covered = length(covered_by)
        
        is_candidate = FALSE
        new_criteria = NULL
        
        if (count_covered == 3) {
          is_candidate = TRUE
          new_criteria = "OrthoDB_FastOMA_RBH"
        } else if (count_covered == 2) {
          is_candidate = TRUE
          new_criteria = paste(sort(covered_by), collapse = "_")
        } else if (count_covered == 1) {
          # Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
          if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
            is_candidate = TRUE
            new_criteria = paste0(method, "_MapMan4")
            
            # Increment count for this mapman4 assignment
            mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
          }
        }
        
        if (is_candidate) {
          dt[i, filter_criteria := new_criteria]
          # covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID)) # if you want to remove both IDs from pair
          covered_genes = unique(c(covered_genes, row$to_geneID))
        }
      }
    } else {
      dt[candidates, filter_criteria := method]
      # covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)])) # if you want to remove both IDs from pair
      covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
    }
  }
}

# After the loop, print checkpoint counts for method_MapMan4 assignments
cat('####  ####  MapMan4 assignment counts per method ####  ####  \n')
## ####  ####  MapMan4 assignment counts per method ####  ####
print(mapman4_counts)
## OrthoDB_MapMan4 FastOMA_MapMan4     RBH_MapMan4 
##            2219            2716             826
df = dt

data.table::fwrite(df, 
                   paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'), 
                     asTable = TRUE)

6.9 Filtered

rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]

# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]


par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

long_dt = rbindlist(lapply(methods, function(method) {
  kept[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method (post filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##          22360           7078           1865           1879
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1        6023        694      690           1162
##   2        3432       1344      600            297
##   3        3215       1187      249            144
##   4        4680       1730      179            136
##   5        5010       2123      147            140
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
                                                    "OrthoDB_FastOMA_RBH",
                                                    "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                                    "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                                    ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Selection by method and methods coverage",
    x = "Filter criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) + 
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    panel.border = element_rect(color = "black", fill = NA, size = 1),  # border around each facet
  )

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


openxlsx::write.xlsx(rejected, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'), 
                     asTable = TRUE)


edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept, 
                     paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'), 
                     asTable = TRUE)


if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}

# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 19490 
##      # pdul unigue genes: 19029 
##      # ath highest connection degree:  39 
##      # pdul highest connection degree:  102 
##      # genes in ath with degree > 30:  5 
##      # genes in pdul with degree > 30:  13
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
  "MCScanX",
  "compara",
  "PLAZA",
  "OrthoDB_FastOMA_RBH",
  "OrthoDB_RBH",
  "FastOMA_OrthoDB",
  "FastOMA_RBH",
  "OrthoDB_MapMan4",
  "RBH_MapMan4",
  "FastOMA_MapMan4",
  "reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)
##         filter_criteria Count
##                  <fctr> <int>
##  1:             MCScanX 20148
##  2:             compara  4234
##  3: OrthoDB_FastOMA_RBH   722
##  4:         OrthoDB_RBH   692
##  5:     FastOMA_OrthoDB  1038
##  6:         FastOMA_RBH   587
##  7:     OrthoDB_MapMan4  2219
##  8:         RBH_MapMan4   826
##  9:     FastOMA_MapMan4  2716
## 10:              reject 33848

6.10 PSS kept/rejected

pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(dt), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)
## 
##             compara     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH 
##                 118                  68                  53                  25 
##             MCScanX OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH 
##                 843                  42                  63                  19 
##         RBH_MapMan4              reject 
##                  32                1142
openxlsx::write.xlsx(pss, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'), 
                     asTable = TRUE)
params_list <- list(
  plantName = 'pavi'
  , plantNameOut = "wildcherry"
  , plantDirOut = file.path('..', 'reports', group, "wildcherry")
  , flag = 2
)

# note: in compara - geneID and prot ID are completely different

env <- new.env()
list2env(params_list, envir = env)

<environment: 0x000002103781bd60>

child_content <- knitr::knit_child("09_translate-child.Rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./09_translate-child.Rmd

| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-94] | |…… | 11% | |……. | 15% [unnamed-chunk-95] | |……… | 19% | |……….. | 22% [unnamed-chunk-96] | |…………. | 26% | |…………… | 30% [unnamed-chunk-97] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-98] | |……………….. | 41% | |…………………. | 44% [unnamed-chunk-99] | |…………………… | 48% | |…………………….. | 52% [unnamed-chunk-100] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-101] | |…………………………. | 63% | |…………………………… | 67% [unnamed-chunk-102] | |…………………………….. | 70% | |………………………………. | 74% [unnamed-chunk-103] | |………………………………… | 78% | |………………………………….. | 81% [unnamed-chunk-104] | |……………………………………. | 85% | |…………………………………….. | 89% [unnamed-chunk-105] | |………………………………………. | 93% | |………………………………………… | 96% [unnamed-chunk-106] | |…………………………………………..| 100%

cat(child_content)

7 Subsection: pavi

if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

7.1 Ortho sources

fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]

df = NULL

for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  if(us == 'compara') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'FastOMA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'MCScanX') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'PLAZA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'OrthoDB') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'RBH') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  }   else cat ('Ignore source(s):', us, '\n')
}
## [1] "../intermediate/_OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator 
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
cat("Pre filter Sources:\n", table(df$source), "\n")
## Pre filter Sources:
##  4367 45924 19708 76456 25594
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID" 


df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 20 × 5
##    from_geneID from_plant to_geneID  to_plant source 
##    <chr>       <chr>      <chr>      <chr>    <chr>  
##  1 AT1G12040   ath        FUN_000050 pavi     FastOMA
##  2 AT1G62440   ath        FUN_000050 pavi     FastOMA
##  3 AT2G43840   ath        FUN_040335 pavi     FastOMA
##  4 AT2G44050   ath        FUN_040336 pavi     FastOMA
##  5 AT1G04220   ath        FUN_000300 pavi     MCScanX
##  6 AT1G04230   ath        FUN_000316 pavi     MCScanX
##  7 ATMG01190   ath        FUN_026738 pavi     MCScanX
##  8 ATMG00910   ath        FUN_040205 pavi     MCScanX
##  9 AT4G39370   ath        FUN_020728 pavi     OrthoDB
## 10 AT3G06350   ath        FUN_020749 pavi     OrthoDB
## 11 AT4G24220   ath        FUN_029917 pavi     OrthoDB
## 12 AT4G24220   ath        FUN_029968 pavi     OrthoDB
## 13 AT1G01030   ath        FUN_025493 pavi     RBH    
## 14 AT1G01040   ath        FUN_011748 pavi     RBH    
## 15 ATMG01250   ath        FUN_040221 pavi     RBH    
## 16 ATMG01360   ath        FUN_026804 pavi     RBH    
## 17 AT2G22690   ath        FUN_021390 pavi     compara
## 18 AT4G37880   ath        FUN_021390 pavi     compara
## 19 AT4G39770   ath        FUN_021012 pavi     compara
## 20 AT4G21740   ath        FUN_032538 pavi     compara
df = df[!duplicated(df), ]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  4085657 218.2   11266889  601.8  14083611  752.2
## Vcells 61491393 469.2  181505251 1384.8 226881563 1731.0

7.2 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 22167
length(unique(dt$to_geneID))
## [1] 21950
table(dt$source)
## 
## compara FastOMA MCScanX OrthoDB     RBH 
##    4367   45924   19708   38228   25594
dt[, present := TRUE]

dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

7.3 Upset plot

if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}


dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))

dff = as.data.frame(dt.wide)

upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods") +
  theme(legend.position = "none")

# Print or/and save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf") # change name

7.4 Gene occurence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

7.5 In/out PSS

df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)

openxlsx::write.xlsx(nin, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'), 
                     asTable = TRUE) # change name

7.6 plant gmm

fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]

colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "OrthoDB"         "RBH"             "compara"         "from_count"     
##  [9] "to_count"        "count_evidence"  "ath_BINCODE"     "ath_NAME"       
## [13] "ath_DESCRIPTION" "all_pathways"    "short_name"      "athName"        
## [17] "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))
## 
## FALSE  TRUE 
## 71643     2
dt[is.na(dt$plant_BINCODE), ]$to_geneID # check ones with strange ID
## [1] "FUN_040149" "FUN_040149"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  4734923 252.9   11266889  601.8  14083611  752.2
## Vcells 60112170 458.7  181547276 1385.1 226881563 1731.0

7.7 Translation table

MapMan Mercator matches: first three levels only

df = df[!duplicated(df), ]


compare_bin <- function(athMercator, plantXMercator) {
  # Helper: Split and truncate tokens to first 3 dot-separated levels
  split_tokens <- function(code) {
    if (is.na(code) || trimws(code) == "") return(character(0))
    
    parts <- unlist(strsplit(code, "\\|"))
    tokens <- unlist(strsplit(parts, ";"))
    tokens <- unique(trimws(tokens))
    
    trunc3levels <- function(token) {
      levels <- unlist(strsplit(token, "\\."))
      paste(head(levels, 3), collapse = ".")
    }
    
    unique(sapply(tokens, trunc3levels))
  }
  
  bin_set <- split_tokens(athMercator)
  v4_set <- split_tokens(plantXMercator)
  
  # If both sets are empty, return "no match"
  if (length(bin_set) == 0 && length(v4_set) == 0) {
    return("no match")
  }
  
  # Check for redundant annotation (e.g. "35.2|35.2|35.2")
  v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
  if (length(bin_set) == 1 &&
      length(v4_parts) > 1 &&
      all(split_tokens(plantXMercator) == bin_set)) {
    result <- paste0("100% match based on ", bin_set)
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for exact match
  if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
    result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for partial match
  common_tokens <- intersect(bin_set, v4_set)
  if (length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }
  
  return("no match")
}



df = df %>%
  dplyr::rowwise() %>%
  dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name 
  dplyr::ungroup()

table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])
##       
##         35.2
##   35.2 15130

7.8 Filter

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 22167 
##      # pavi unigue genes: 21950 
##      # ath highest connection degree:  122 
##      # pavi highest connection degree:  115 
##      # genes in ath with degree > 30:  242 
##      # genes in pavi with degree > 30:  131
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()


if (flag == 1) {
  methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  methods = c("MCScanX", "FastOMA", 'RBH')
}


match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
  dt[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]

ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##          41733          15130           8890           5892
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1       20986       7805     7489           4881
##   2        7358       2253      946            572
##   3        6191       2096      298            243
##   4        6023       2400      131            163
##   5        1175        576       26             33
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")




if (flag != 4) {
  special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
  special_methods = c("FastOMA", 'RBH')
}

# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))

for (method in methods) {

  base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE & 
               !(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
  add_cond = rep(TRUE, nrow(dt))
  
  if (method %in% special_methods) {
    add_cond = rep(TRUE, nrow(dt))
  }
  
  candidates = which(base_cond & add_cond)
  
  if (length(candidates) > 0) {
    if (method %in% special_methods) {
      for (i in candidates) {
        row = dt[i]
        covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
        count_covered = length(covered_by)
        
        is_candidate = FALSE
        new_criteria = NULL
        
        if (count_covered == 3) {
          is_candidate = TRUE
          new_criteria = "OrthoDB_FastOMA_RBH"
        } else if (count_covered == 2) {
          is_candidate = TRUE
          new_criteria = paste(sort(covered_by), collapse = "_")
        } else if (count_covered == 1) {
          # Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
          if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
            is_candidate = TRUE
            new_criteria = paste0(method, "_MapMan4")
            
            # Increment count for this mapman4 assignment
            mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
          }
        }
        
        if (is_candidate) {
          dt[i, filter_criteria := new_criteria]
          # covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID)) # if you want to remove both IDs from pair
          covered_genes = unique(c(covered_genes, row$to_geneID))
        }
      }
    } else {
      dt[candidates, filter_criteria := method]
      # covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)])) # if you want to remove both IDs from pair
      covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
    }
  }
}

# After the loop, print checkpoint counts for method_MapMan4 assignments
cat('####  ####  MapMan4 assignment counts per method ####  ####  \n')
## ####  ####  MapMan4 assignment counts per method ####  ####
print(mapman4_counts)
## OrthoDB_MapMan4 FastOMA_MapMan4     RBH_MapMan4 
##            2958            3542             967
df = dt

data.table::fwrite(df, 
                   paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'), 
                     asTable = TRUE)

7.9 Filtered

rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]

# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]


par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

long_dt = rbindlist(lapply(methods, function(method) {
  kept[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method (post filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##          24400           7051           1933           2488
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1        7098        513      732           1692
##   2        4829       1685      770            385
##   3        5345       1909      276            215
##   4        5953       2368      129            163
##   5        1175        576       26             33
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
                                                    "OrthoDB_FastOMA_RBH",
                                                    "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                                    "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                                    ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Selection by method and methods coverage",
    x = "Filter criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) + 
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    panel.border = element_rect(color = "black", fill = NA, size = 1),  # border around each facet
  )

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


openxlsx::write.xlsx(rejected, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'), 
                     asTable = TRUE)


edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept, 
                     paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'), 
                     asTable = TRUE)


if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}

# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 19167 
##      # pavi unigue genes: 19574 
##      # ath highest connection degree:  60 
##      # pavi highest connection degree:  102 
##      # genes in ath with degree > 30:  9 
##      # genes in pavi with degree > 30:  22
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
  "MCScanX",
  "compara",
  "PLAZA",
  "OrthoDB_FastOMA_RBH",
  "OrthoDB_RBH",
  "FastOMA_OrthoDB",
  "FastOMA_RBH",
  "OrthoDB_MapMan4",
  "RBH_MapMan4",
  "FastOMA_MapMan4",
  "reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)
##         filter_criteria Count
##                  <fctr> <int>
##  1:             MCScanX 19708
##  2:             compara  1309
##  3: OrthoDB_FastOMA_RBH  3052
##  4:         OrthoDB_RBH  1307
##  5:     FastOMA_OrthoDB  2133
##  6:         FastOMA_RBH   896
##  7:     OrthoDB_MapMan4  2958
##  8:         RBH_MapMan4   967
##  9:     FastOMA_MapMan4  3542
## 10:              reject 35773

7.10 PSS kept/rejected

pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(dt), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)
## 
##             compara     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH 
##                  47                 149                  79                  36 
##             MCScanX OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH 
##                 789                  84                 108                  61 
##         RBH_MapMan4              reject 
##                  26                1370
openxlsx::write.xlsx(pss, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'), 
                     asTable = TRUE)
params_list <- list(
  plantName = 'parm'
  , plantNameOut = "apricot"
  , plantDirOut = file.path('..', 'reports', group, "apricot")
  , flag = 3
)

# note: in compara - geneID and prot ID are completely different

env <- new.env()
list2env(params_list, envir = env)

<environment: 0x000002100fcadbf8>

child_content <- knitr::knit_child("09_translate-child.Rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./09_translate-child.Rmd

| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-122] | |…… | 11% | |……. | 15% [unnamed-chunk-123] | |……… | 19% | |……….. | 22% [unnamed-chunk-124] | |…………. | 26% | |…………… | 30% [unnamed-chunk-125] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-126] | |……………….. | 41% | |…………………. | 44% [unnamed-chunk-127] | |…………………… | 48% | |…………………….. | 52% [unnamed-chunk-128] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-129] | |…………………………. | 63% | |…………………………… | 67% [unnamed-chunk-130] | |…………………………….. | 70% | |………………………………. | 74% [unnamed-chunk-131] | |………………………………… | 78% | |………………………………….. | 81% [unnamed-chunk-132] | |……………………………………. | 85% | |…………………………………….. | 89% [unnamed-chunk-133] | |………………………………………. | 93% | |………………………………………… | 96% [unnamed-chunk-134] | |…………………………………………..| 100%

cat(child_content)

8 Subsection: parm

if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

8.1 Ortho sources

fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]

df = NULL

for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  if(us == 'compara') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'FastOMA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'MCScanX') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'PLAZA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'OrthoDB') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'RBH') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  }   else cat ('Ignore source(s):', us, '\n')
}
## [1] "../intermediate/_OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator 
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
cat("Pre filter Sources:\n", table(df$source), "\n")
## Pre filter Sources:
##  43038 18615 104168 25259
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID" 


df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 16 × 5
##    from_geneID from_plant to_geneID       to_plant source 
##    <chr>       <chr>      <chr>           <chr>    <chr>  
##  1 AT1G12040   ath        PruarM.1G000500 parm     FastOMA
##  2 AT1G62440   ath        PruarM.1G000500 parm     FastOMA
##  3 AT2G47410   ath        PruarM.8G368500 parm     FastOMA
##  4 AT4G02060   ath        PruarM.8G368700 parm     FastOMA
##  5 AT1G04400   ath        PruarM.1G051900 parm     MCScanX
##  6 AT1G04410   ath        PruarM.1G052500 parm     MCScanX
##  7 AT5G64410   ath        PruarM.8G146300 parm     MCScanX
##  8 AT5G64410   ath        PruarM.8G146200 parm     MCScanX
##  9 AT5G10270   ath        PruarM.1G279700 parm     OrthoDB
## 10 AT5G64960   ath        PruarM.1G279700 parm     OrthoDB
## 11 AT2G15790   ath        PruarM.8G195500 parm     OrthoDB
## 12 AT4G34660   ath        PruarM.8G163400 parm     OrthoDB
## 13 AT1G01010   ath        PruarM.2G368400 parm     RBH    
## 14 AT1G01030   ath        PruarM.5G193300 parm     RBH    
## 15 ATMG01360   ath        PruarM.4G189900 parm     RBH    
## 16 ATMG01360   ath        PruarM.4G190100 parm     RBH
df = df[!duplicated(df), ]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  4122851 220.2   11266889  601.8  14083611  752.2
## Vcells 62494882 476.8  181547276 1385.1 226881563 1731.0

8.2 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 22351
length(unique(dt$to_geneID))
## [1] 22551
table(dt$source)
## 
## FastOMA MCScanX OrthoDB     RBH 
##   43038   18615   52084   25259
dt[, present := TRUE]

dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

8.3 Upset plot

if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}


dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))

dff = as.data.frame(dt.wide)

upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods") +
  theme(legend.position = "none")

# Print or/and save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf") # change name

8.4 Gene occurence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

8.5 In/out PSS

df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)

openxlsx::write.xlsx(nin, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'), 
                     asTable = TRUE) # change name

8.6 plant gmm

fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]

colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "OrthoDB"         "RBH"             "from_count"      "to_count"       
##  [9] "count_evidence"  "ath_BINCODE"     "ath_NAME"        "ath_DESCRIPTION"
## [13] "all_pathways"    "short_name"      "athName"         "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))
## 
## FALSE 
## 84042
dt[is.na(dt$plant_BINCODE), ]$to_geneID # check ones with strange ID
## character(0)
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  4075996 217.7   11266889  601.8  14083611  752.2
## Vcells 48022776 366.4  145237821 1108.1 226881563 1731.0

8.7 Translation table

MapMan Mercator matches: first three levels only

df = df[!duplicated(df), ]


compare_bin <- function(athMercator, plantXMercator) {
  # Helper: Split and truncate tokens to first 3 dot-separated levels
  split_tokens <- function(code) {
    if (is.na(code) || trimws(code) == "") return(character(0))
    
    parts <- unlist(strsplit(code, "\\|"))
    tokens <- unlist(strsplit(parts, ";"))
    tokens <- unique(trimws(tokens))
    
    trunc3levels <- function(token) {
      levels <- unlist(strsplit(token, "\\."))
      paste(head(levels, 3), collapse = ".")
    }
    
    unique(sapply(tokens, trunc3levels))
  }
  
  bin_set <- split_tokens(athMercator)
  v4_set <- split_tokens(plantXMercator)
  
  # If both sets are empty, return "no match"
  if (length(bin_set) == 0 && length(v4_set) == 0) {
    return("no match")
  }
  
  # Check for redundant annotation (e.g. "35.2|35.2|35.2")
  v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
  if (length(bin_set) == 1 &&
      length(v4_parts) > 1 &&
      all(split_tokens(plantXMercator) == bin_set)) {
    result <- paste0("100% match based on ", bin_set)
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for exact match
  if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
    result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for partial match
  common_tokens <- intersect(bin_set, v4_set)
  if (length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }
  
  return("no match")
}



df = df %>%
  dplyr::rowwise() %>%
  dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name 
  dplyr::ungroup()

table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])
##       
##         35.2
##   35.2 27414

8.8 Filter

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 22351 
##      # parm unigue genes: 22551 
##      # ath highest connection degree:  267 
##      # parm highest connection degree:  113 
##      # genes in ath with degree > 30:  344 
##      # genes in parm with degree > 30:  392
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()


if (flag == 1) {
  methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  methods = c("MCScanX", "FastOMA", 'RBH')
}


match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
  dt[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]

ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##          40370          27414          10212           6046
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1       20994      20423     8961           4950
##   2        7132       2400      896            600
##   3        6352       2247      227            306
##   4        5892       2344      128            190
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")




if (flag != 4) {
  special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
  special_methods = c("FastOMA", 'RBH')
}

# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))

for (method in methods) {

  base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE & 
               !(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
  add_cond = rep(TRUE, nrow(dt))
  
  if (method %in% special_methods) {
    add_cond = rep(TRUE, nrow(dt))
  }
  
  candidates = which(base_cond & add_cond)
  
  if (length(candidates) > 0) {
    if (method %in% special_methods) {
      for (i in candidates) {
        row = dt[i]
        covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
        count_covered = length(covered_by)
        
        is_candidate = FALSE
        new_criteria = NULL
        
        if (count_covered == 3) {
          is_candidate = TRUE
          new_criteria = "OrthoDB_FastOMA_RBH"
        } else if (count_covered == 2) {
          is_candidate = TRUE
          new_criteria = paste(sort(covered_by), collapse = "_")
        } else if (count_covered == 1) {
          # Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
          if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
            is_candidate = TRUE
            new_criteria = paste0(method, "_MapMan4")
            
            # Increment count for this mapman4 assignment
            mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
          }
        }
        
        if (is_candidate) {
          dt[i, filter_criteria := new_criteria]
          # covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID)) # if you want to remove both IDs from pair
          covered_genes = unique(c(covered_genes, row$to_geneID))
        }
      }
    } else {
      dt[candidates, filter_criteria := method]
      # covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)])) # if you want to remove both IDs from pair
      covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
    }
  }
}

# After the loop, print checkpoint counts for method_MapMan4 assignments
cat('####  ####  MapMan4 assignment counts per method ####  ####  \n')
## ####  ####  MapMan4 assignment counts per method ####  ####
print(mapman4_counts)
## OrthoDB_MapMan4 FastOMA_MapMan4     RBH_MapMan4 
##            3175            3319            1196
df = dt

data.table::fwrite(df, 
                   paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'), 
                     asTable = TRUE)

8.9 Filtered

rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]

# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]


par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

long_dt = rbindlist(lapply(methods, function(method) {
  kept[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method (post filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##          23682           6793           1675           2451
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1        7527        545      606           1563
##   2        4655       1845      728            426
##   3        5608       2059      213            272
##   4        5892       2344      128            190
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
                                                    "OrthoDB_FastOMA_RBH",
                                                    "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                                    "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                                    ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Selection by method and methods coverage",
    x = "Filter criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) + 
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    panel.border = element_rect(color = "black", fill = NA, size = 1),  # border around each facet
  )

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


openxlsx::write.xlsx(rejected, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'), 
                     asTable = TRUE)


edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept, 
                     paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'), 
                     asTable = TRUE)


if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}

# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 18948 
##      # parm unigue genes: 18528 
##      # ath highest connection degree:  66 
##      # parm highest connection degree:  102 
##      # genes in ath with degree > 30:  10 
##      # genes in parm with degree > 30:  18
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
  "MCScanX",
  "compara",
  "PLAZA",
  "OrthoDB_FastOMA_RBH",
  "OrthoDB_RBH",
  "FastOMA_OrthoDB",
  "FastOMA_RBH",
  "OrthoDB_MapMan4",
  "RBH_MapMan4",
  "FastOMA_MapMan4",
  "reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)
##        filter_criteria Count
##                 <fctr> <int>
## 1:             MCScanX 18615
## 2: OrthoDB_FastOMA_RBH  3716
## 3:         OrthoDB_RBH  1371
## 4:     FastOMA_OrthoDB  2357
## 5:         FastOMA_RBH   852
## 6:     OrthoDB_MapMan4  3175
## 7:         RBH_MapMan4  1196
## 8:     FastOMA_MapMan4  3319
## 9:              reject 49441

8.10 PSS kept/rejected

pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(dt), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)
## 
##     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH             MCScanX 
##                 154                 110                  23                 768 
## OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH         RBH_MapMan4 
##                 123                 128                  56                  31 
##              reject 
##                1482
openxlsx::write.xlsx(pss, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'), 
                     asTable = TRUE)
params_list <- list(
  plantName = 'pcox'
  , plantNameOut = "pear"
  , plantDirOut = file.path('..', 'reports', group, "pear")
  , flag = 4
)


env <- new.env()
list2env(params_list, envir = env)

<environment: 0x00000210a3d35938>

child_content <- knitr::knit_child("09_translate-child.Rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./09_translate-child.Rmd

| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-150] | |…… | 11% | |……. | 15% [unnamed-chunk-151] | |……… | 19% | |……….. | 22% [unnamed-chunk-152] | |…………. | 26% | |…………… | 30% [unnamed-chunk-153] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-154] | |……………….. | 41% | |…………………. | 44% [unnamed-chunk-155] | |…………………… | 48% | |…………………….. | 52% [unnamed-chunk-156] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-157] | |…………………………. | 63% | |…………………………… | 67% [unnamed-chunk-158] | |…………………………….. | 70% | |………………………………. | 74% [unnamed-chunk-159] | |………………………………… | 78% | |………………………………….. | 81% [unnamed-chunk-160] | |……………………………………. | 85% | |…………………………………….. | 89% [unnamed-chunk-161] | |………………………………………. | 93% | |………………………………………… | 96% [unnamed-chunk-162] | |…………………………………………..| 100%

cat(child_content)

9 Subsection: pcox

if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

9.1 Ortho sources

fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]

df = NULL

for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  if(us == 'compara') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'FastOMA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'MCScanX') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'PLAZA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'OrthoDB') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'RBH') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  }   else cat ('Ignore source(s):', us, '\n')
}
## [1] "../intermediate/_OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator 
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
cat("Pre filter Sources:\n", table(df$source), "\n")
## Pre filter Sources:
##  75969 33983 36090
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID" 


df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 12 × 5
##    from_geneID from_plant to_geneID                     to_plant source 
##    <chr>       <chr>      <chr>                         <chr>    <chr>  
##  1 AT1G20520   ath        Pyrco.da.v2a1.augustus.000230 pcox     FastOMA
##  2 AT1G76210   ath        Pyrco.da.v2a1.augustus.000230 pcox     FastOMA
##  3 AT5G53090   ath        Pyrco.da.v2a1.snap.445350     pcox     FastOMA
##  4 AT5G53100   ath        Pyrco.da.v2a1.snap.445350     pcox     FastOMA
##  5 AT1G03900   ath        Pyrco.da.v2a1.chr10A.089110   pcox     MCScanX
##  6 AT1G03920   ath        Pyrco.da.v2a1.chr10A.089090   pcox     MCScanX
##  7 AT5G56870   ath        Pyrco.da.v2a1.chr9A.225970    pcox     MCScanX
##  8 AT5G57035   ath        Pyrco.da.v2a1.chr9A.225390    pcox     MCScanX
##  9 AT1G01030   ath        Pyrco.da.v2a1.chr14A.371380   pcox     RBH    
## 10 AT1G01030   ath        Pyrco.da.v2a1.chr1A.345960    pcox     RBH    
## 11 ATMG01250   ath        Pyrco.da.v2a1.chr5A.045340    pcox     RBH    
## 12 ATMG01250   ath        Pyrco.da.v2a1.snap.153710     pcox     RBH
df = df[!duplicated(df), ]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  3663517 195.7   11266889  601.8  14083611  752.2
## Vcells 47066163 359.1  145237821 1108.1 226881563 1731.0

9.2 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 21603
length(unique(dt$to_geneID))
## [1] 30838
table(dt$source)
## 
## FastOMA MCScanX     RBH 
##   75969   33983   36090
dt[, present := TRUE]

dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

9.3 Upset plot

if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}


dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))

dff = as.data.frame(dt.wide)

upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods") +
  theme(legend.position = "none")

# Print or/and save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf") # change name

9.4 Gene occurence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

9.5 In/out PSS

df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)

openxlsx::write.xlsx(nin, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'), 
                     asTable = TRUE) # change name

9.6 plant gmm

fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]

colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "RBH"             "from_count"      "to_count"        "count_evidence" 
##  [9] "ath_BINCODE"     "ath_NAME"        "ath_DESCRIPTION" "all_pathways"   
## [13] "short_name"      "athName"         "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))
## 
## FALSE 
## 95885
dt[is.na(dt$plant_BINCODE), ]$to_geneID # check ones with strange ID
## character(0)
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  3713411 198.4   11266889  601.8  14083611  752.2
## Vcells 41971878 320.3  145237821 1108.1 226881563 1731.0

9.7 Translation table

MapMan Mercator matches: first three levels only

df = df[!duplicated(df), ]


compare_bin <- function(athMercator, plantXMercator) {
  # Helper: Split and truncate tokens to first 3 dot-separated levels
  split_tokens <- function(code) {
    if (is.na(code) || trimws(code) == "") return(character(0))
    
    parts <- unlist(strsplit(code, "\\|"))
    tokens <- unlist(strsplit(parts, ";"))
    tokens <- unique(trimws(tokens))
    
    trunc3levels <- function(token) {
      levels <- unlist(strsplit(token, "\\."))
      paste(head(levels, 3), collapse = ".")
    }
    
    unique(sapply(tokens, trunc3levels))
  }
  
  bin_set <- split_tokens(athMercator)
  v4_set <- split_tokens(plantXMercator)
  
  # If both sets are empty, return "no match"
  if (length(bin_set) == 0 && length(v4_set) == 0) {
    return("no match")
  }
  
  # Check for redundant annotation (e.g. "35.2|35.2|35.2")
  v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
  if (length(bin_set) == 1 &&
      length(v4_parts) > 1 &&
      all(split_tokens(plantXMercator) == bin_set)) {
    result <- paste0("100% match based on ", bin_set)
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for exact match
  if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
    result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for partial match
  common_tokens <- intersect(bin_set, v4_set)
  if (length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }
  
  return("no match")
}



df = df %>%
  dplyr::rowwise() %>%
  dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name 
  dplyr::ungroup()

table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])
##       
##         35.2
##   35.2 16546

9.8 Filter

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 21603 
##      # pcox unigue genes: 30838 
##      # ath highest connection degree:  136 
##      # pcox highest connection degree:  116 
##      # genes in ath with degree > 30:  261 
##      # genes in pcox with degree > 30:  287
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()


if (flag == 1) {
  methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  methods = c("MCScanX", "FastOMA", 'RBH')
}


match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
  dt[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]

ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##          60208          16546          10595           8536
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1       36081       8117     9410           7873
##   2       12902       4306      975            468
##   3       11225       4123      210            195
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")




if (flag != 4) {
  special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
  special_methods = c("FastOMA", 'RBH')
}

# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))

for (method in methods) {

  base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE & 
               !(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
  add_cond = rep(TRUE, nrow(dt))
  
  if (method %in% special_methods) {
    add_cond = rep(TRUE, nrow(dt))
  }
  
  candidates = which(base_cond & add_cond)
  
  if (length(candidates) > 0) {
    if (method %in% special_methods) {
      for (i in candidates) {
        row = dt[i]
        covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
        count_covered = length(covered_by)
        
        is_candidate = FALSE
        new_criteria = NULL
        
        if (count_covered == 3) {
          is_candidate = TRUE
          new_criteria = "OrthoDB_FastOMA_RBH"
        } else if (count_covered == 2) {
          is_candidate = TRUE
          new_criteria = paste(sort(covered_by), collapse = "_")
        } else if (count_covered == 1) {
          # Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
          if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
            is_candidate = TRUE
            new_criteria = paste0(method, "_MapMan4")
            
            # Increment count for this mapman4 assignment
            mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
          }
        }
        
        if (is_candidate) {
          dt[i, filter_criteria := new_criteria]
          # covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID)) # if you want to remove both IDs from pair
          covered_genes = unique(c(covered_genes, row$to_geneID))
        }
      }
    } else {
      dt[candidates, filter_criteria := method]
      # covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)])) # if you want to remove both IDs from pair
      covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
    }
  }
}

# After the loop, print checkpoint counts for method_MapMan4 assignments
cat('####  ####  MapMan4 assignment counts per method ####  ####  \n')
## ####  ####  MapMan4 assignment counts per method ####  ####
print(mapman4_counts)
## FastOMA_MapMan4     RBH_MapMan4 
##           13297            2564
df = dt

data.table::fwrite(df, 
                   paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'), 
                     asTable = TRUE)

9.9 Filtered

rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]

# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]


par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

long_dt = rbindlist(lapply(methods, function(method) {
  kept[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method (post filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##          38785           9730           3471           3470
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1       16409       1653     2355           2871
##   2       11151       3954      906            404
##   3       11225       4123      210            195
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
                                                    "OrthoDB_FastOMA_RBH",
                                                    "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                                    "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                                    ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Selection by method and methods coverage",
    x = "Filter criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) + 
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    panel.border = element_rect(color = "black", fill = NA, size = 1),  # border around each facet
  )

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


openxlsx::write.xlsx(rejected, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'), 
                     asTable = TRUE)


edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept, 
                     paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'), 
                     asTable = TRUE)


if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}

# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 19450 
##      # pcox unigue genes: 28452 
##      # ath highest connection degree:  60 
##      # pcox highest connection degree:  102 
##      # genes in ath with degree > 30:  109 
##      # genes in pcox with degree > 30:  78
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
  "MCScanX",
  "compara",
  "PLAZA",
  "OrthoDB_FastOMA_RBH",
  "OrthoDB_RBH",
  "FastOMA_OrthoDB",
  "FastOMA_RBH",
  "OrthoDB_MapMan4",
  "RBH_MapMan4",
  "FastOMA_MapMan4",
  "reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)
##    filter_criteria Count
##             <fctr> <int>
## 1:         MCScanX 33983
## 2:     FastOMA_RBH  5612
## 3:     RBH_MapMan4  2564
## 4: FastOMA_MapMan4 13297
## 5:          reject 40429

9.10 PSS kept/rejected

pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(dt), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)
## 
## FastOMA_MapMan4     FastOMA_RBH         MCScanX     RBH_MapMan4          reject 
##             493             175            1459             109            1068
openxlsx::write.xlsx(pss, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'), 
                     asTable = TRUE)
params_list <- list(
  plantName = 'pcer'
  , plantNameOut = "cherryplum"
  , plantDirOut = file.path('..', 'reports', group, "cherryplum")
  , flag = 4
)

# note: in compara - geneID and prot ID are completely different

env <- new.env()
list2env(params_list, envir = env)

<environment: 0x00000210a300f0e8>

child_content <- knitr::knit_child("09_translate-child.Rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./09_translate-child.Rmd

| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-178] | |…… | 11% | |……. | 15% [unnamed-chunk-179] | |……… | 19% | |……….. | 22% [unnamed-chunk-180] | |…………. | 26% | |…………… | 30% [unnamed-chunk-181] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-182] | |……………….. | 41% | |…………………. | 44% [unnamed-chunk-183] | |…………………… | 48% | |…………………….. | 52% [unnamed-chunk-184] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-185] | |…………………………. | 63% | |…………………………… | 67% [unnamed-chunk-186] | |…………………………….. | 70% | |………………………………. | 74% [unnamed-chunk-187] | |………………………………… | 78% | |………………………………….. | 81% [unnamed-chunk-188] | |……………………………………. | 85% | |…………………………………….. | 89% [unnamed-chunk-189] | |………………………………………. | 93% | |………………………………………… | 96% [unnamed-chunk-190] | |…………………………………………..| 100%

cat(child_content)

10 Subsection: pcer

if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

10.1 Ortho sources

fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]

df = NULL

for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  if(us == 'compara') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'FastOMA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'MCScanX') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'PLAZA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'OrthoDB') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'RBH') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  }   else cat ('Ignore source(s):', us, '\n')
}
## [1] "../intermediate/_OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator 
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
cat("Pre filter Sources:\n", table(df$source), "\n")
## Pre filter Sources:
##  162100 63358 80487
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID" 


df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 12 × 5
##    from_geneID from_plant to_geneID      to_plant source 
##    <chr>       <chr>      <chr>          <chr>    <chr>  
##  1 AT2G32760   ath        Pcer_000001-RA pcer     FastOMA
##  2 AT1G07920   ath        Pcer_000002-RA pcer     FastOMA
##  3 AT1G16650   ath        Pcer_097557-RA pcer     FastOMA
##  4 AT1G53530   ath        Pcer_097558-RA pcer     FastOMA
##  5 AT1G04220   ath        Pcer_000137-RA pcer     MCScanX
##  6 AT1G04230   ath        Pcer_000150-RA pcer     MCScanX
##  7 ATMG00080   ath        Pcer_091446-RA pcer     MCScanX
##  8 ATMG00513   ath        Pcer_091469-RA pcer     MCScanX
##  9 AT1G01030   ath        Pcer_027461-RA pcer     RBH    
## 10 AT1G01030   ath        Pcer_038773-RA pcer     RBH    
## 11 ATMG01330   ath        Pcer_091451-RA pcer     RBH    
## 12 ATMG01360   ath        Pcer_096779-RA pcer     RBH
df = df[!duplicated(df), ]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  3549616 189.6   11266889  601.8  14083611  752.2
## Vcells 45685339 348.6  145237821 1108.1 226881563 1731.0

10.2 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 22197
length(unique(dt$to_geneID))
## [1] 71437
table(dt$source)
## 
## FastOMA MCScanX     RBH 
##  162100   63358   80487
dt[, present := TRUE]

dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

10.3 Upset plot

if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}


dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))

dff = as.data.frame(dt.wide)

upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods") +
  theme(legend.position = "none")

# Print or/and save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf") # change name

10.4 Gene occurence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

10.5 In/out PSS

df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)

openxlsx::write.xlsx(nin, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'), 
                     asTable = TRUE) # change name

10.6 plant gmm

fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]

colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "RBH"             "from_count"      "to_count"        "count_evidence" 
##  [9] "ath_BINCODE"     "ath_NAME"        "ath_DESCRIPTION" "all_pathways"   
## [13] "short_name"      "athName"         "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))
## 
##  FALSE   TRUE 
## 201293      3
dt[is.na(dt$plant_BINCODE), ]$to_geneID # check ones with strange ID
## [1] "Pcer_097367-RB" "Pcer_097392-RB" "Pcer_097544-RB"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  4229286 225.9   11266889  601.8  14083611  752.2
## Vcells 57585409 439.4  145338077 1108.9 226881563 1731.0

10.7 Translation table

MapMan Mercator matches: first three levels only

df = df[!duplicated(df), ]


compare_bin <- function(athMercator, plantXMercator) {
  # Helper: Split and truncate tokens to first 3 dot-separated levels
  split_tokens <- function(code) {
    if (is.na(code) || trimws(code) == "") return(character(0))
    
    parts <- unlist(strsplit(code, "\\|"))
    tokens <- unlist(strsplit(parts, ";"))
    tokens <- unique(trimws(tokens))
    
    trunc3levels <- function(token) {
      levels <- unlist(strsplit(token, "\\."))
      paste(head(levels, 3), collapse = ".")
    }
    
    unique(sapply(tokens, trunc3levels))
  }
  
  bin_set <- split_tokens(athMercator)
  v4_set <- split_tokens(plantXMercator)
  
  # If both sets are empty, return "no match"
  if (length(bin_set) == 0 && length(v4_set) == 0) {
    return("no match")
  }
  
  # Check for redundant annotation (e.g. "35.2|35.2|35.2")
  v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
  if (length(bin_set) == 1 &&
      length(v4_parts) > 1 &&
      all(split_tokens(plantXMercator) == bin_set)) {
    result <- paste0("100% match based on ", bin_set)
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for exact match
  if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
    result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for partial match
  common_tokens <- intersect(bin_set, v4_set)
  if (length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }
  
  return("no match")
}



df = df %>%
  dplyr::rowwise() %>%
  dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name 
  dplyr::ungroup()

table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])
##       
##         35.2
##   35.2 40662

10.8 Filter

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 22197 
##      # pcer unigue genes: 71437 
##      # ath highest connection degree:  270 
##      # pcer highest connection degree:  113 
##      # genes in ath with degree > 30:  890 
##      # genes in pcer with degree > 30:  449
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()


if (flag == 1) {
  methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  methods = c("MCScanX", "FastOMA", 'RBH')
}


match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
  dt[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]

ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##         126211          40662          19225          15198
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1       75114      21956    16707          13585
##   2       29613      10371     1995           1240
##   3       21484       8335      523            373
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")




if (flag != 4) {
  special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
  special_methods = c("FastOMA", 'RBH')
}

# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))

for (method in methods) {

  base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE & 
               !(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
  add_cond = rep(TRUE, nrow(dt))
  
  if (method %in% special_methods) {
    add_cond = rep(TRUE, nrow(dt))
  }
  
  candidates = which(base_cond & add_cond)
  
  if (length(candidates) > 0) {
    if (method %in% special_methods) {
      for (i in candidates) {
        row = dt[i]
        covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
        count_covered = length(covered_by)
        
        is_candidate = FALSE
        new_criteria = NULL
        
        if (count_covered == 3) {
          is_candidate = TRUE
          new_criteria = "OrthoDB_FastOMA_RBH"
        } else if (count_covered == 2) {
          is_candidate = TRUE
          new_criteria = paste(sort(covered_by), collapse = "_")
        } else if (count_covered == 1) {
          # Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
          if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
            is_candidate = TRUE
            new_criteria = paste0(method, "_MapMan4")
            
            # Increment count for this mapman4 assignment
            mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
          }
        }
        
        if (is_candidate) {
          dt[i, filter_criteria := new_criteria]
          # covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID)) # if you want to remove both IDs from pair
          covered_genes = unique(c(covered_genes, row$to_geneID))
        }
      }
    } else {
      dt[candidates, filter_criteria := method]
      # covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)])) # if you want to remove both IDs from pair
      covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
    }
  }
}

# After the loop, print checkpoint counts for method_MapMan4 assignments
cat('####  ####  MapMan4 assignment counts per method ####  ####  \n')
## ####  ####  MapMan4 assignment counts per method ####  ####
print(mapman4_counts)
## FastOMA_MapMan4     RBH_MapMan4 
##           34806            6865
df = dt

data.table::fwrite(df, 
                   paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'), 
                     asTable = TRUE)

10.9 Filtered

rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]

# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]


par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

long_dt = rbindlist(lapply(methods, function(method) {
  kept[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method (post filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##          89442          20615           4446           8578
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1       41943       2600     2064           7103
##   2       26015       9680     1859           1102
##   3       21484       8335      523            373
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
                                                    "OrthoDB_FastOMA_RBH",
                                                    "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                                    "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                                    ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Selection by method and methods coverage",
    x = "Filter criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) + 
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    panel.border = element_rect(color = "black", fill = NA, size = 1),  # border around each facet
  )

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


openxlsx::write.xlsx(rejected, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'), 
                     asTable = TRUE)


edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept, 
                     paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'), 
                     asTable = TRUE)


if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}

# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 20165 
##      # pcer unigue genes: 63906 
##      # ath highest connection degree:  209 
##      # pcer highest connection degree:  102 
##      # genes in ath with degree > 30:  461 
##      # genes in pcer with degree > 30:  166
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
  "MCScanX",
  "compara",
  "PLAZA",
  "OrthoDB_FastOMA_RBH",
  "OrthoDB_RBH",
  "FastOMA_OrthoDB",
  "FastOMA_RBH",
  "OrthoDB_MapMan4",
  "RBH_MapMan4",
  "FastOMA_MapMan4",
  "reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)
##    filter_criteria Count
##             <fctr> <int>
## 1:         MCScanX 63358
## 2:     FastOMA_RBH 18052
## 3:     RBH_MapMan4  6865
## 4: FastOMA_MapMan4 34806
## 5:          reject 78215

10.10 PSS kept/rejected

pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(dt), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)
## 
## FastOMA_MapMan4     FastOMA_RBH         MCScanX     RBH_MapMan4          reject 
##            1418             583            2541             266            2659
openxlsx::write.xlsx(pss, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'), 
                     asTable = TRUE)
params_list <- list(
  plantName = 'psib'
  , plantNameOut = "siberianapricot"
  , plantDirOut = file.path('..', 'reports', group, "siberianapricot")
  , flag = 3
)


env <- new.env()
list2env(params_list, envir = env)

<environment: 0x0000021055ab0768>

child_content <- knitr::knit_child("09_translate-child.Rmd", envir = env, quiet = FALSE)
## 
## 
## processing file: ./09_translate-child.Rmd

| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-206] | |…… | 11% | |……. | 15% [unnamed-chunk-207] | |……… | 19% | |……….. | 22% [unnamed-chunk-208] | |…………. | 26% | |…………… | 30% [unnamed-chunk-209] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-210] | |……………….. | 41% | |…………………. | 44% [unnamed-chunk-211] | |…………………… | 48% | |…………………….. | 52% [unnamed-chunk-212] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-213] | |…………………………. | 63% | |…………………………… | 67% [unnamed-chunk-214] | |…………………………….. | 70% | |………………………………. | 74% [unnamed-chunk-215] | |………………………………… | 78% | |………………………………….. | 81% [unnamed-chunk-216] | |……………………………………. | 85% | |…………………………………….. | 89% [unnamed-chunk-217] | |………………………………………. | 93% | |………………………………………… | 96% [unnamed-chunk-218] | |…………………………………………..| 100%

cat(child_content)

11 Subsection: psib

if (!dir.exists(plantDirOut)) dir.create(plantDirOut, recursive = TRUE)

11.1 Ortho sources

fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]

df = NULL

for (i in fl){
  
  print(i)
  
  dt = data.table::fread(i)
  us = unique(dt$source)
  
  if(us == 'compara') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'FastOMA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'MCScanX') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'PLAZA') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'OrthoDB') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  } else if (us == 'RBH') {
    dt = dt[dt$to_plant == plantName, ]
    df = rbind(df, dt)
  }   else cat ('Ignore source(s):', us, '\n')
}
## [1] "../intermediate/_OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator 
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
cat("Pre filter Sources:\n", table(df$source), "\n")
## Pre filter Sources:
##  40732 16398 20889 25288
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID" 


df %>%
  dplyr::group_by(source) %>%
  dplyr::slice_head(n = 2) %>%
  dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
  dplyr::arrange(source) %>%
  dplyr::ungroup() -> first_last_three_per_source

print(first_last_three_per_source, n = nrow(first_last_three_per_source))
## # A tibble: 16 × 5
##    from_geneID from_plant to_geneID         to_plant source 
##    <chr>       <chr>      <chr>             <chr>    <chr>  
##  1 AT1G12040   ath        PaF106G0100000005 psib     FastOMA
##  2 AT1G62440   ath        PaF106G0100000005 psib     FastOMA
##  3 AT3G07140   ath        PaF106G0800032954 psib     FastOMA
##  4 AT3G07140   ath        PaF106G0800032956 psib     FastOMA
##  5 AT1G04230   ath        PaF106G0100000358 psib     MCScanX
##  6 AT1G04240   ath        PaF106G0100000361 psib     MCScanX
##  7 ATCG00340   ath        PaF106G0700028636 psib     MCScanX
##  8 ATCG00350   ath        PaF106G0700028635 psib     MCScanX
##  9 AT2G43200   ath        PaF106G0500020125 psib     OrthoDB
## 10 AT1G52770   ath        PaF106G0300013722 psib     OrthoDB
## 11 AT3G56950   ath        PaF106G0700028984 psib     OrthoDB
## 12 AT5G37530   ath        PaF106G0300012640 psib     OrthoDB
## 13 AT1G01030   ath        PaF106G0500020091 psib     RBH    
## 14 AT1G01040   ath        PaF106G0200009357 psib     RBH    
## 15 ATMG01190   ath        PaF106G0600023358 psib     RBH    
## 16 ATMG01250   ath        PaF106G0700028671 psib     RBH
df = df[!duplicated(df), ]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##            used  (Mb) gc trigger  (Mb)  max used   (Mb)
## Ncells  3909777 208.9    9013512 481.4  14083611  752.2
## Vcells 63705863 486.1  118542936 904.5 226881563 1731.0

11.2 To wide format

dt = df
length(unique(dt$from_geneID))
## [1] 22040
length(unique(dt$to_geneID))
## [1] 20342
table(dt$source)
## 
## FastOMA MCScanX OrthoDB     RBH 
##   40732   16398   20889   25288
dt[, present := TRUE]

dt.wide = dcast(dt, from_geneID + to_geneID ~ source, value.var = "present", fill = FALSE)

dt.wide = dt.wide[order(dt.wide$from_geneID, dt.wide$to_geneID), ]

11.3 Upset plot

if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}


dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]

hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))

dff = as.data.frame(dt.wide)

upset_plot = upset(
  dff,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods") +
  theme(legend.position = "none")

# Print or/and save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf") # change name

11.4 Gene occurence

# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)

ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)), 
        grep('from_count', colnames(dt.wide)),
        grep('to_count', colnames(dt.wide)), 
        grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]

11.5 In/out PSS

df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)

nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)

openxlsx::write.xlsx(nin, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'), 
                     asTable = TRUE) # change name

11.6 plant gmm

fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]

colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')

colnames(df)
##  [1] "from_geneID"     "to_geneID"       "FastOMA"         "MCScanX"        
##  [5] "OrthoDB"         "RBH"             "from_count"      "to_count"       
##  [9] "count_evidence"  "ath_BINCODE"     "ath_NAME"        "ath_DESCRIPTION"
## [13] "all_pathways"    "short_name"      "athName"         "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))
## 
## FALSE 
## 58143
dt[is.na(dt$plant_BINCODE), ]$to_geneID # check ones with strange ID
## character(0)
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]
rm(list = setdiff(ls(), c("df",
                          "ath.gmm", "ath.annot", 
                          "group", 
                          "plantName", 
                          "plantNameOut", 
                          "plantDirOut",
                          "flag")))

gc()
##            used  (Mb) gc trigger  (Mb)  max used   (Mb)
## Ncells  3843785 205.3    9013512 481.4  14083611  752.2
## Vcells 42433627 323.8  118542936 904.5 226881563 1731.0

11.7 Translation table

MapMan Mercator matches: first three levels only

df = df[!duplicated(df), ]


compare_bin <- function(athMercator, plantXMercator) {
  # Helper: Split and truncate tokens to first 3 dot-separated levels
  split_tokens <- function(code) {
    if (is.na(code) || trimws(code) == "") return(character(0))
    
    parts <- unlist(strsplit(code, "\\|"))
    tokens <- unlist(strsplit(parts, ";"))
    tokens <- unique(trimws(tokens))
    
    trunc3levels <- function(token) {
      levels <- unlist(strsplit(token, "\\."))
      paste(head(levels, 3), collapse = ".")
    }
    
    unique(sapply(tokens, trunc3levels))
  }
  
  bin_set <- split_tokens(athMercator)
  v4_set <- split_tokens(plantXMercator)
  
  # If both sets are empty, return "no match"
  if (length(bin_set) == 0 && length(v4_set) == 0) {
    return("no match")
  }
  
  # Check for redundant annotation (e.g. "35.2|35.2|35.2")
  v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
  if (length(bin_set) == 1 &&
      length(v4_parts) > 1 &&
      all(split_tokens(plantXMercator) == bin_set)) {
    result <- paste0("100% match based on ", bin_set)
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for exact match
  if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
    result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
    if (result == "100% match based on 35.2") return("bad MapMan")
    return(result)
  }
  
  # Check for partial match
  common_tokens <- intersect(bin_set, v4_set)
  if (length(common_tokens) > 0) {
    return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
  }
  
  return("no match")
}



df = df %>%
  dplyr::rowwise() %>%
  dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name 
  dplyr::ungroup()

table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])
##       
##         35.2
##   35.2 12813

11.8 Filter

cat('####  ####  before filter ####  ####  \n')
## ####  ####  before filter ####  ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 22040 
##      # psib unigue genes: 20342 
##      # ath highest connection degree:  58 
##      # psib highest connection degree:  115 
##      # genes in ath with degree > 30:  135 
##      # genes in psib with degree > 30:  106
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()


if (flag == 1) {
  methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  methods = c("MCScanX", "FastOMA", 'RBH')
}


match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
  dt[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]

ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##          35534          12813           6004           3792
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1       18977       6640     5025           3323
##   2        6361       2039      641            312
##   3        6065       2253      249             97
##   4        4131       1881       89             60
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")




if (flag != 4) {
  special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
  special_methods = c("FastOMA", 'RBH')
}

# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))

for (method in methods) {

  base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE & 
               !(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
  add_cond = rep(TRUE, nrow(dt))
  
  if (method %in% special_methods) {
    add_cond = rep(TRUE, nrow(dt))
  }
  
  candidates = which(base_cond & add_cond)
  
  if (length(candidates) > 0) {
    if (method %in% special_methods) {
      for (i in candidates) {
        row = dt[i]
        covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
        count_covered = length(covered_by)
        
        is_candidate = FALSE
        new_criteria = NULL
        
        if (count_covered == 3) {
          is_candidate = TRUE
          new_criteria = "OrthoDB_FastOMA_RBH"
        } else if (count_covered == 2) {
          is_candidate = TRUE
          new_criteria = paste(sort(covered_by), collapse = "_")
        } else if (count_covered == 1) {
          # Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
          if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
            is_candidate = TRUE
            new_criteria = paste0(method, "_MapMan4")
            
            # Increment count for this mapman4 assignment
            mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
          }
        }
        
        if (is_candidate) {
          dt[i, filter_criteria := new_criteria]
          # covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID)) # if you want to remove both IDs from pair
          covered_genes = unique(c(covered_genes, row$to_geneID))
        }
      }
    } else {
      dt[candidates, filter_criteria := method]
      # covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)])) # if you want to remove both IDs from pair
      covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
    }
  }
}

# After the loop, print checkpoint counts for method_MapMan4 assignments
cat('####  ####  MapMan4 assignment counts per method ####  ####  \n')
## ####  ####  MapMan4 assignment counts per method ####  ####
print(mapman4_counts)
## OrthoDB_MapMan4 FastOMA_MapMan4     RBH_MapMan4 
##            1102            5266            1947
df = dt

data.table::fwrite(df, 
                   paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-15.txt'), 
                   sep = '\t')
openxlsx::write.xlsx(df, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-all_2025-09-15.xlsx'), 
                     asTable = TRUE)

11.9 Filtered

rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]

# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]

kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]


par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)

long_dt = rbindlist(lapply(methods, function(method) {
  kept[, .(
    Method = method,
    Match_Type = match_categories,
    Count = c(
      sum(get(method) == TRUE & MapMan4_Match == "no match"),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
      sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
      sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
    )
  )]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
  fcase(
    as.character(Match_Type) == "bad MapMan", "35.2",
    default = as.character(Match_Type)
  ),
  levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
  ggplot2::geom_bar(stat = "identity") +
  ggplot2::labs(title = "MapMan match types count per method (post filter)",
                x = "Method",
                y = "Count",
                fill = "Match Type") +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)] 
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)
## 
##    100% match      bad MapMan       no match partial match  
##          22449           6409           1748           1682
table(dtsub$count_evidence, dtsub$MapMan4_Match)
##    
##     100% match  bad MapMan no match partial match 
##   1        8325        859      878           1318
##   2        4482       1595      540            212
##   3        5511       2074      241             92
##   4        4131       1881       89             60
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
  ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
  levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
       x = "count_evidence",
       y = "Frequency",
       fill = "MapMan4_Match") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria", 
                                     names(kept), value = TRUE)] 
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
                                                    "OrthoDB_FastOMA_RBH",
                                                    "FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH", 
                                                    "OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
                                                    ))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
  labs(
    title = "Selection by method and methods coverage",
    x = "Filter criteria",
    y = "Frequency",
    fill = "MapMan4 Match"
  ) +
  theme_minimal() +
  ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) + 
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    panel.border = element_rect(color = "black", fill = NA, size = 1),  # border around each facet
  )

ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"), 
                device = "pdf", width = 6, height = 6, units = "in")


openxlsx::write.xlsx(rejected, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'), 
                     asTable = TRUE)


edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
  geneID = names(comp$membership),
  weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
#  # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept, 
                     paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'), 
                     asTable = TRUE)


if (flag == 1) {
  source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {  # make flags
  source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
  source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
  source_cols = c("MCScanX", "FastOMA", 'RBH')
}

# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
  kept,
  intersect = source_cols,
  name = "Source",
  width_ratio = 0.1,
  base_annotations = list(
    'Intersection size' = intersection_size(counts = FALSE) #,
    # 'Intersection ratio' = intersection_ratio()
  ),
  # Sort intersections first by degree (number of sets in intersection) descending,
  # then by intersection size (cardinality) descending within each degree
  sort_intersections_by = c("degree", "cardinality"),
  sort_intersections = "descending") + 
  ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")

# Print or save the plot
print(upset_plot)

ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"), 
       plot = upset_plot, width = 24, height = 6, device = "pdf")



cat('####  ####  after filter ####  ####  \n')
## ####  ####  after filter ####  ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
    "\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
    "\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
    "\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
    "\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
    "\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
    )
##  # ath unigue genes: 18799 
##      # psib unigue genes: 18194 
##      # ath highest connection degree:  43 
##      # psib highest connection degree:  102 
##      # genes in ath with degree > 30:  12 
##      # genes in psib with degree > 30:  24
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
  "MCScanX",
  "compara",
  "PLAZA",
  "OrthoDB_FastOMA_RBH",
  "OrthoDB_RBH",
  "FastOMA_OrthoDB",
  "FastOMA_RBH",
  "OrthoDB_MapMan4",
  "RBH_MapMan4",
  "FastOMA_MapMan4",
  "reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)
##        filter_criteria Count
##                 <fctr> <int>
## 1:             MCScanX 16398
## 2: OrthoDB_FastOMA_RBH  3708
## 3:         OrthoDB_RBH   819
## 4:     FastOMA_OrthoDB  1042
## 5:         FastOMA_RBH  2006
## 6:     OrthoDB_MapMan4  1102
## 7:         RBH_MapMan4  1947
## 8:     FastOMA_MapMan4  5266
## 9:              reject 25855

11.10 PSS kept/rejected

pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss, 
                 df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria", 
                                          names(dt), value = TRUE)],
                 by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)
## 
##     FastOMA_MapMan4     FastOMA_OrthoDB         FastOMA_RBH             MCScanX 
##                 262                  46                  69                 674 
## OrthoDB_FastOMA_RBH     OrthoDB_MapMan4         OrthoDB_RBH         RBH_MapMan4 
##                 104                  69                  33                  87 
##              reject 
##                 816
openxlsx::write.xlsx(pss, 
                     paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'), 
                     asTable = TRUE)

12 SessionInfo

sessionInfo()
## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 10 x64 (build 19045)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=Slovenian_Slovenia.utf8  LC_CTYPE=Slovenian_Slovenia.utf8   
## [3] LC_MONETARY=Slovenian_Slovenia.utf8 LC_NUMERIC=C                       
## [5] LC_TIME=Slovenian_Slovenia.utf8    
## 
## time zone: Europe/Warsaw
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ComplexUpset_1.3.3 ggplot2_4.0.0      knitr_1.50         data.table_1.17.8 
## [5] magrittr_2.0.4    
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6       jsonlite_2.0.0     crayon_1.5.3       dplyr_1.1.4       
##  [5] compiler_4.5.1     zip_2.3.3          Rcpp_1.1.0         tidyselect_1.2.1  
##  [9] stringr_1.5.2      jquerylib_0.1.4    scales_1.4.0       yaml_2.3.10       
## [13] fastmap_1.2.0      R6_2.6.1           labeling_0.4.3     generics_0.1.4    
## [17] patchwork_1.3.2    igraph_2.1.4       openxlsx_4.2.8     tibble_3.3.0      
## [21] bslib_0.9.0        pillar_1.11.1      RColorBrewer_1.1-3 rlang_1.1.6       
## [25] utf8_1.2.6         cachem_1.1.0       stringi_1.8.7      xfun_0.53         
## [29] sass_0.4.10        S7_0.2.0           cli_3.6.5          withr_3.0.2       
## [33] digest_0.6.37      grid_4.5.1         rstudioapi_0.17.1  lifecycle_1.0.4   
## [37] vctrs_0.6.5        evaluate_1.0.5     glue_1.8.0         farver_2.1.2      
## [41] colorspace_2.1-1   rmarkdown_2.29     tools_4.5.1        pkgconfig_2.0.3   
## [45] htmltools_0.5.8.1